aho-corasick-1.1.3/.cargo_vcs_info.json0000644000000001360000000000100134010ustar { "git": { "sha1": "56256dca1bcd2365fd1dc987c1c06195429a2e2c" }, "path_in_vcs": "" }aho-corasick-1.1.3/.github/workflows/ci.yml000064400000000000000000000130751046102023000167120ustar 00000000000000name: ci on: pull_request: push: branches: - master schedule: - cron: '00 01 * * *' # The section is needed to drop write-all permissions that are granted on # `schedule` event. By specifying any permission explicitly all others are set # to none. By using the principle of least privilege the damage a compromised # workflow can do (because of an injection or compromised third party tool or # action) is restricted. Currently the worklow doesn't need any additional # permission except for pulling the code. Adding labels to issues, commenting # on pull-requests, etc. may need additional permissions: # # Syntax for this section: # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#permissions # # Reference for how to assign permissions on a job-by-job basis: # https://docs.github.com/en/actions/using-jobs/assigning-permissions-to-jobs # # Reference for available permissions that we can enable if needed: # https://docs.github.com/en/actions/security-guides/automatic-token-authentication#permissions-for-the-github_token permissions: # to fetch code (actions/checkout) contents: read jobs: test: name: test env: # For some builds, we use cross to test on 32-bit and big-endian # systems. CARGO: cargo # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`. # Note that we only use cross on Linux, so setting a target on a # different OS will just use normal cargo. TARGET: # Bump this as appropriate. We pin to a version to make sure CI # continues to work as cross releases in the past have broken things # in subtle ways. CROSS_VERSION: v0.2.5 runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: include: - build: pinned os: ubuntu-latest rust: 1.60.0 - build: stable os: ubuntu-latest rust: stable - build: stable-x86 os: ubuntu-latest rust: stable target: i686-unknown-linux-gnu - build: stable-aarch64 os: ubuntu-latest rust: stable target: aarch64-unknown-linux-gnu - build: stable-powerpc64 os: ubuntu-latest rust: stable target: powerpc64-unknown-linux-gnu - build: stable-s390x os: ubuntu-latest rust: stable target: s390x-unknown-linux-gnu - build: beta os: ubuntu-latest rust: beta - build: nightly os: ubuntu-latest rust: nightly - build: macos os: macos-latest rust: stable - build: win-msvc os: windows-latest rust: stable - build: win-gnu os: windows-latest rust: stable-x86_64-gnu steps: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.rust }} - name: Install and configure Cross if: matrix.os == 'ubuntu-latest' && matrix.target != '' run: | # In the past, new releases of 'cross' have broken CI. So for now, we # pin it. We also use their pre-compiled binary releases because cross # has over 100 dependencies and takes a bit to compile. dir="$RUNNER_TEMP/cross-download" mkdir "$dir" echo "$dir" >> $GITHUB_PATH cd "$dir" curl -LO "https://github.com/cross-rs/cross/releases/download/$CROSS_VERSION/cross-x86_64-unknown-linux-musl.tar.gz" tar xf cross-x86_64-unknown-linux-musl.tar.gz # We used to install 'cross' from master, but it kept failing. So now # we build from a known-good version until 'cross' becomes more stable # or we find an alternative. 
Notably, between v0.2.1 and current # master (2022-06-14), the number of Cross's dependencies has doubled. echo "CARGO=cross" >> $GITHUB_ENV echo "TARGET=--target ${{ matrix.target }}" >> $GITHUB_ENV - name: Show command used for Cargo run: | echo "cargo command is: ${{ env.CARGO }}" echo "target flag is: ${{ env.TARGET }}" - name: Show CPU info for debugging if: matrix.os == 'ubuntu-latest' run: lscpu # See: https://github.com/rust-lang/regex/blob/a2887636930156023172e4b376a6febad4e49120/.github/workflows/ci.yml#L145-L163 - name: Pin memchr to 2.6.2 if: matrix.build == 'pinned' run: cargo update -p memchr --precise 2.6.2 - run: ${{ env.CARGO }} build --verbose $TARGET - run: ${{ env.CARGO }} doc --verbose $TARGET - run: ${{ env.CARGO }} test --verbose $TARGET - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features std,perf-literal $TARGET - run: ${{ env.CARGO }} test --lib --verbose --no-default-features $TARGET - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features std $TARGET - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features perf-literal $TARGET - run: ${{ env.CARGO }} test --lib --verbose --no-default-features --features std,perf-literal,logging $TARGET - if: matrix.build == 'nightly' run: ${{ env.CARGO }} build --manifest-path aho-corasick-debug/Cargo.toml $TARGET rustfmt: name: rustfmt runs-on: ubuntu-latest steps: - name: Checkout repository uses: actions/checkout@v3 - name: Install Rust uses: dtolnay/rust-toolchain@master with: toolchain: stable components: rustfmt - name: Check formatting run: | cargo fmt --all -- --check aho-corasick-1.1.3/.gitignore000064400000000000000000000002111046102023000141530ustar 00000000000000.*.swp doc tags examples/ss10pusa.csv build target /Cargo.lock scratch* bench_large/huge BREADCRUMBS /tmp /aho-corasick-debug/Cargo.lock aho-corasick-1.1.3/.vim/coc-settings.json000064400000000000000000000005451046102023000163430ustar 00000000000000{ "rust-analyzer.linkedProjects": [ "aho-corasick-debug/Cargo.toml", "benchmarks/engines/rust-aho-corasick/Cargo.toml", "benchmarks/engines/rust-daachorse/Cargo.toml", "benchmarks/engines/rust-jetscii/Cargo.toml", "benchmarks/engines/naive/Cargo.toml", "benchmarks/shared/Cargo.toml", "fuzz/Cargo.toml", "Cargo.toml" ] } aho-corasick-1.1.3/COPYING000064400000000000000000000001761046102023000132300ustar 00000000000000This project is dual-licensed under the Unlicense and MIT licenses. You may use this code under the terms of either license. aho-corasick-1.1.3/Cargo.toml0000644000000030320000000000100113750ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.60.0" name = "aho-corasick" version = "1.1.3" authors = ["Andrew Gallant "] exclude = [ "/aho-corasick-debug", "/benchmarks", "/tmp", ] autotests = false description = "Fast multiple substring searching." 
homepage = "https://github.com/BurntSushi/aho-corasick" readme = "README.md" keywords = [ "string", "search", "text", "pattern", "multi", ] categories = ["text-processing"] license = "Unlicense OR MIT" repository = "https://github.com/BurntSushi/aho-corasick" [package.metadata.docs.rs] all-features = true rustdoc-args = [ "--cfg", "docsrs", "--generate-link-to-definition", ] [profile.bench] debug = 2 [profile.release] debug = 2 [lib] name = "aho_corasick" [dependencies.log] version = "0.4.17" optional = true [dependencies.memchr] version = "2.4.0" optional = true default-features = false [dev-dependencies.doc-comment] version = "0.3.3" [features] default = [ "std", "perf-literal", ] logging = ["dep:log"] perf-literal = ["dep:memchr"] std = ["memchr?/std"] aho-corasick-1.1.3/Cargo.toml.orig000064400000000000000000000051531046102023000150640ustar 00000000000000[package] name = "aho-corasick" version = "1.1.3" #:version authors = ["Andrew Gallant "] description = "Fast multiple substring searching." homepage = "https://github.com/BurntSushi/aho-corasick" repository = "https://github.com/BurntSushi/aho-corasick" readme = "README.md" keywords = ["string", "search", "text", "pattern", "multi"] license = "Unlicense OR MIT" categories = ["text-processing"] autotests = false exclude = ["/aho-corasick-debug", "/benchmarks", "/tmp"] edition = "2021" rust-version = "1.60.0" [lib] name = "aho_corasick" [features] default = ["std", "perf-literal"] std = ["memchr?/std"] # Enables prefilter optimizations that depend on external crates. perf-literal = ["dep:memchr"] # Enable logging via the 'log' crate. This is useful for seeing messages about # internal decisions and metrics. For example, how the choice of the internal # Aho-Corasick implementation is used or the heap usage of an automaton. logging = ["dep:log"] # Provides a trait impl for fst::Automaton for nfa::noncontiguous::NFA, # nfa::contiguous::NFA and dfa::DFA. This is useful for searching an # FST with an Aho-Corasick automaton. Note that this does not apply # to the top-level 'AhoCorasick' type, as it does not implement the # aho_corasick::automaton::Automaton trait, and thus enabling this feature does # not cause it to implement fst::Automaton either. # # NOTE: Currently this feature is not available as `fst` is not at 1.0 yet, # and this would make `fst` a public dependency. If you absolutely need this, # you can copy the (very small) src/transducer.rs file to your tree. It # specifically does not use any private APIs and should work after replacing # 'crate::' with 'aho_corasick::'. # # NOTE: I think my current plan is to flip this around an add an optional # dependency on 'aho-corasick' to the 'fst' crate and move the trait impls # there. But I haven't gotten around to it yet. # transducer = ["fst"] [dependencies] log = { version = "0.4.17", optional = true } memchr = { version = "2.4.0", default-features = false, optional = true } [dev-dependencies] doc-comment = "0.3.3" # fst = "0.4.5" [package.metadata.docs.rs] # We want to document all features. all-features = true # This opts into a nightly unstable option to show the features that need to be # enabled for public API items. To do that, we set 'docsrs', and when that's # enabled, we enable the 'doc_auto_cfg' feature. 
#
# To test this locally, run:
#
#     RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features
rustdoc-args = ["--cfg", "docsrs", "--generate-link-to-definition"]

[profile.release]
debug = true

[profile.bench]
debug = true

aho-corasick-1.1.3/DESIGN.md

This document describes the internal design of this crate, which is an object
lesson in what happens when you take a fairly simple old algorithm like
Aho-Corasick and make it fast and production ready.

The target audience of this document is Rust programmers that have some
familiarity with string searching, however, one does not need to know the
Aho-Corasick algorithm in order to read this (it is explained below). One
should, however, know what a trie is. (If you don't, go read its Wikipedia
article.)

The center-piece of this crate is an implementation of Aho-Corasick. On its
own, Aho-Corasick isn't that complicated. The complex pieces come from the
different variants of Aho-Corasick implemented in this crate. Specifically,
they are:

* Aho-Corasick as a noncontiguous NFA. States have their transitions
represented sparsely, and each state puts its transitions in its own separate
allocation. Hence the name "noncontiguous."
* Aho-Corasick as a contiguous NFA. This NFA uses a single allocation to
represent the transitions of all states. That is, transitions are laid out
contiguously in memory. Moreover, states near the starting state are
represented densely, such that finding the next state ID takes a constant
number of instructions.
* Aho-Corasick as a DFA. In this case, all states are represented densely in
a transition table that uses one allocation.
* Supporting "standard" match semantics, along with its overlapping variant,
in addition to leftmost-first and leftmost-longest semantics. The "standard"
semantics are typically what you see in a textbook description of
Aho-Corasick. However, Aho-Corasick is also useful as an optimization in
regex engines, which often use leftmost-first or leftmost-longest semantics.
Thus, it is useful to implement those semantics here. The "standard" and
"leftmost" search algorithms are subtly different, and also require slightly
different construction algorithms.
* Support for ASCII case insensitive matching.
* Support for accelerating searches when the patterns all start with a small
number of fixed bytes. Or alternatively, when the patterns all contain a
small number of rare bytes. (Searching for these bytes uses SIMD vectorized
code courtesy of `memchr`.)
* Transparent support for alternative SIMD vectorized search routines for a
small number of literals, such as the Teddy algorithm. We call these "packed"
search routines because they use SIMD. They can often be an order of
magnitude faster than just Aho-Corasick, but don't scale as well.
* Support for searching streams. This can reuse most of the underlying code,
but does require careful buffering support.
* Support for anchored searches, which permit efficient "is prefix" checks
for a large number of patterns.

When you combine all of this together along with trying to make everything as
fast as possible, what you end up with is entirely too much code with too
much `unsafe`. Alas, I was not smart enough to figure out how to reduce it.
Instead, we will explain it.

# Basics

The fundamental problem this crate is trying to solve is to determine the
occurrences of possibly many patterns in a haystack. The naive way to solve
this is to look for a match for each pattern at each position in the
haystack:

    for i in 0..haystack.len():
        for p in patterns.iter():
            if haystack[i..].starts_with(p.bytes()):
                return Match(p.id(), i, i + p.bytes().len())

Those four lines are effectively all this crate does. The problem with those
four lines is that they are very slow, especially when you're searching for a
large number of patterns.

While there are many different algorithms available to solve this, a popular
one is Aho-Corasick. It's a common solution because it's not too hard to
implement, scales quite well even when searching for thousands of patterns
and is generally pretty fast. Aho-Corasick does well here because, regardless
of the number of patterns you're searching for, it always visits each byte in
the haystack exactly once. This means, generally speaking, adding more
patterns to an Aho-Corasick automaton does not make it slower. (Strictly
speaking, however, this is not true, since a larger automaton will make less
effective use of the CPU's cache.)

Aho-Corasick can be succinctly described as a trie with state transitions
between some of the nodes that efficiently instruct the search algorithm to
try matching alternative keys in the trie. The trick is that these state
transitions are arranged such that each byte of input needs to be inspected
only once. These state transitions are typically called "failure
transitions," because they instruct the searcher (the thing traversing the
automaton while reading from the haystack) what to do when a byte in the
haystack does not correspond to a valid transition in the current state of
the trie. More formally, a failure transition points to a state in the
automaton that may lead to a match whose prefix is a proper suffix of the
path traversed through the trie so far. (If no such proper suffix exists,
then the failure transition points back to the start state of the trie,
effectively restarting the search.)

This is perhaps simpler to explain pictorially. For example, let's say we
built an Aho-Corasick automaton with the following patterns: 'abcd' and
'cef'. The trie looks like this:

         a - S1 - b - S2 - c - S3 - d - S4*
        /
    S0 - c - S5 - e - S6 - f - S7*

where states marked with a `*` are match states (meaning, the search
algorithm should stop and report a match to the caller).

So given this trie, it should be somewhat straightforward to see how it can
be used to determine whether any particular haystack *starts* with either
`abcd` or `cef`. It's easy to express this in code:

    fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool {
        let mut state_id = trie.start();
        // If the empty pattern is in trie, then state_id is a match state.
        if trie.is_match(state_id) {
            return true;
        }
        for (i, &b) in haystack.iter().enumerate() {
            state_id = match trie.next_state(state_id, b) {
                Some(id) => id,
                // If there was no transition for this state and byte, then
                // we know the haystack does not start with one of the
                // patterns in our trie.
                None => return false,
            };
            if trie.is_match(state_id) {
                return true;
            }
        }
        false
    }

And that's pretty much it. All we do is move through the trie starting with
the bytes at the beginning of the haystack. If we find ourselves in a
position where we can't move, or if we've looked through the entire haystack
without seeing a match state, then we know the haystack does not start with
any of the patterns in the trie.

The meat of the Aho-Corasick algorithm is in how we add failure transitions
to our trie to keep searching efficient.
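To make the trie above concrete before moving on, here is a minimal, runnable
sketch of a trie that supports the `has_prefix` routine. To be clear, this is
purely illustrative: none of these names correspond to this crate's
internals, and a dense 256-entry table per state is exactly the kind of
memory usage that the noncontiguous NFA avoids.

    // A dense trie over bytes. State 0 is the start state.
    struct Trie {
        // trans[s][b] is the state to move to from state s on byte b.
        trans: Vec<[Option<usize>; 256]>,
        // Whether a state corresponds to the end of a pattern.
        is_match: Vec<bool>,
    }

    impl Trie {
        fn new(patterns: &[&[u8]]) -> Trie {
            let mut trie =
                Trie { trans: vec![[None; 256]], is_match: vec![false] };
            for p in patterns.iter() {
                let mut s = 0;
                for &b in p.iter() {
                    s = match trie.trans[s][usize::from(b)] {
                        Some(next) => next,
                        None => {
                            // Add a fresh state and link it in.
                            trie.trans.push([None; 256]);
                            trie.is_match.push(false);
                            let next = trie.trans.len() - 1;
                            trie.trans[s][usize::from(b)] = Some(next);
                            next
                        }
                    };
                }
                trie.is_match[s] = true;
            }
            trie
        }

        fn has_prefix(&self, haystack: &[u8]) -> bool {
            let mut s = 0;
            if self.is_match[s] {
                return true;
            }
            for &b in haystack.iter() {
                s = match self.trans[s][usize::from(b)] {
                    Some(next) => next,
                    None => return false,
                };
                if self.is_match[s] {
                    return true;
                }
            }
            false
        }
    }

With the patterns above, `trie.has_prefix(b"abcd")` returns `true`, but
`trie.has_prefix(b"abcef")` returns `false` even though `abcef` contains
`cef`: a plain trie can only see matches that begin at the first byte. That
limitation is exactly what the Aho-Corasick algorithm fixes.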
Specifically, it permits us to not only check whether a haystack *starts*
with any one of a number of patterns, but rather, whether the haystack
contains any of a number of patterns *anywhere* in the haystack.

As mentioned before, failure transitions connect a proper suffix of the path
traversed through the trie before, with a path that leads to a match that has
a prefix corresponding to that proper suffix. So in our case, for patterns
`abcd` and `cef`, with a haystack `abcef`, we want to transition to state
`S5` (from the diagram above) from `S3` upon seeing that the byte following
`c` is not `d`. Namely, the proper suffix in this example is `c`, which is a
prefix of `cef`. So the modified diagram looks like this:

         a - S1 - b - S2 - c - S3 - d - S4*
        /                  /
       /   ----------------
      /   /
    S0 - c - S5 - e - S6 - f - S7*

One thing that isn't shown in this diagram is that *all* states have a
failure transition, but only `S3` has a *non-trivial* failure transition.
That is, all other states have a failure transition back to the start state.
So if our haystack was `abzabcd`, then the searcher would transition back to
`S0` after seeing `z`, which effectively restarts the search. (Because there
is no pattern in our trie that has a prefix of `bz` or `z`.)

The code for traversing this *automaton* or *finite state machine* (it is no
longer just a trie) is not that much different from the `has_prefix` code
above:

    fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool {
        let mut state_id = fsm.start();
        // If the empty pattern is in fsm, then state_id is a match state.
        if fsm.is_match(state_id) {
            return true;
        }
        for (i, &b) in haystack.iter().enumerate() {
            // While the diagram above doesn't show this, we may wind up
            // needing to follow multiple failure transitions before we land
            // on a state in which we can advance. Therefore, when searching
            // for the next state, we need to loop until we don't see a
            // failure transition.
            //
            // This loop terminates because the start state has no empty
            // transitions. Every transition from the start state either
            // points to another state, or loops back to the start state.
            loop {
                match fsm.next_state(state_id, b) {
                    Some(id) => {
                        state_id = id;
                        break;
                    }
                    // Unlike our code above, if there was no transition for
                    // this state, then we don't quit. Instead, we look for
                    // this state's failure transition and follow that
                    // instead.
                    None => {
                        state_id = fsm.next_fail_state(state_id);
                    }
                };
            }
            if fsm.is_match(state_id) {
                return true;
            }
        }
        false
    }

Other than the complication around traversing failure transitions, this code
is still roughly "traverse the automaton with bytes from the haystack, and
quit when a match is seen."

And that concludes our section on the basics. While we didn't go deep into
how the automaton is built (see `src/nfa/noncontiguous.rs`, which has
detailed comments about that), the basic structure of Aho-Corasick should be
reasonably clear.

# NFAs and DFAs

There are generally two types of finite automata: non-deterministic finite
automata (NFA) and deterministic finite automata (DFA). The difference
between them is, principally, that an NFA can be in multiple states at once.
This is typically accomplished by things called _epsilon_ transitions, where
one could move to a new state without consuming any bytes from the input.
(The other mechanism by which NFAs can be in more than one state is where the
same byte in a particular state transitions to multiple distinct states.)

In contrast, a DFA can only ever be in one state at a time.
A DFA has no epsilon transitions, and for any given state, a byte transitions
to at most one other state. By this formulation, the Aho-Corasick automaton
described in the previous section is an NFA. This is because failure
transitions are, effectively, epsilon transitions. That is, whenever the
automaton is in state `S`, it is actually in the set of states that are
reachable by recursively following failure transitions from `S` until you
reach the start state. (This means that, for example, the start state is
always active since the start state is reachable via failure transitions from
any state in the automaton.)

NFAs have a lot of nice properties. They tend to be easier to construct, and
also tend to use less memory. However, their primary downside is that they
are typically slower to execute a search with. For example, the code above
showing how to search with an Aho-Corasick automaton needs to potentially
iterate through many failure transitions for every byte of input. While this
is a fairly small amount of overhead, this can add up, especially if the
automaton has a lot of overlapping patterns with a lot of failure
transitions.

A DFA's search code, by contrast, looks like this:

    fn contains(dfa: &DFA, haystack: &[u8]) -> bool {
        let mut state_id = dfa.start();
        // If the empty pattern is in dfa, then state_id is a match state.
        if dfa.is_match(state_id) {
            return true;
        }
        for (i, &b) in haystack.iter().enumerate() {
            // An Aho-Corasick DFA *never* has a missing state that requires
            // failure transitions to be followed. One byte of input advances
            // the automaton by one state. Always.
            state_id = dfa.next_state(state_id, b);
            if dfa.is_match(state_id) {
                return true;
            }
        }
        false
    }

The search logic here is much simpler than for the NFA, and this tends to
translate into significant performance benefits as well, since there's a lot
less work being done for each byte in the haystack. How is this accomplished?
It's done by pre-following all failure transitions for all states for all
bytes in the alphabet, and then building a single state transition table.
Building this DFA can be much more costly than building the NFA, and can use
much more memory, but the better performance can be worth it.

Users of this crate can actually choose between using one of two possible
NFAs (noncontiguous or contiguous) or a DFA. By default, a contiguous NFA is
used in most circumstances, but if the number of patterns is small enough, a
DFA will be used. A contiguous NFA is chosen because it uses orders of
magnitude less memory than a DFA, takes only a little longer to build than a
noncontiguous NFA and usually gets pretty close to the search speed of a DFA.
(Callers can override this automatic selection via the
`AhoCorasickBuilder::kind` configuration.)

# More DFA tricks

As described in the previous section, one of the downsides of using a DFA is
that it uses more memory and can take longer to build. One small way of
mitigating these concerns is to map the alphabet used by the automaton into a
smaller space. Typically, the alphabet of a DFA has 256 elements in it: one
element for each possible value that fits into a byte. However, in many
cases, one does not need the full alphabet. For example, if all patterns in
an Aho-Corasick automaton are ASCII letters, then this only uses up 52
distinct bytes. As far as the automaton is concerned, the rest of the 204
bytes are indistinguishable from one another: they will never discriminate
between a match or a non-match. Therefore, in cases like that, the alphabet
can be shrunk to just 53 elements. One for each ASCII letter, and then
another to serve as a placeholder for every other unused byte.

In practice, this library doesn't quite compute the optimal set of
equivalence classes, but it's close enough in most cases. The key idea is
that this then allows the transition table for the DFA to be potentially much
smaller.

The downside of doing this, however, is that since the transition table is
defined in terms of this smaller alphabet space, every byte in the haystack
must be re-mapped to this smaller space. This requires an additional 256-byte
table. In practice, this can lead to a small search time hit, but it can be
difficult to measure. Moreover, it can sometimes lead to faster search times
for bigger automata, since it could be the difference between more parts of
the automaton staying in the CPU cache or not.

One other trick for DFAs employed by this crate is the notion of
premultiplying state identifiers. Specifically, the normal way to compute the
next transition in a DFA is via the following (assuming that the transition
table is laid out sequentially in memory, in row-major order, where the rows
are states):

    next_state_id = dfa.transitions[current_state_id * 256 + current_byte]

However, since the value `256` is a fixed constant, we can actually
premultiply the state identifiers in the table when we build the table
initially. Then, the next transition computation simply becomes:

    next_state_id = dfa.transitions[current_state_id + current_byte]

This doesn't seem like much, but when this is being executed for every byte
of input that you're searching, saving that extra multiplication instruction
can add up.

The same optimization works even when equivalence classes are enabled, as
described above. The only difference is that the premultiplication is by the
total number of equivalence classes instead of 256.

There isn't much downside to premultiplying state identifiers, other than it
imposes a smaller limit on the total number of states in the DFA. Namely,
with premultiplied state identifiers, you run out of room in your state
identifier representation more rapidly than if the identifiers are just state
indices.

Both equivalence classes and premultiplication are always enabled. There is
an `AhoCorasickBuilder::byte_classes` configuration, but disabling this just
makes it so there are always 256 equivalence classes, i.e., every class
corresponds to precisely one byte. When it's disabled, the equivalence class
map itself is still used. The purpose of disabling it is to aid debugging of
the underlying automaton, which can be easier to comprehend when it uses
actual byte values for its transitions instead of equivalence classes.

# Match semantics

One of the more interesting things about this implementation of Aho-Corasick
that (as far as this author knows) separates it from other implementations,
is that it natively supports leftmost-first and leftmost-longest match
semantics. Briefly, match semantics refer to the decision procedure by which
searching will disambiguate matches when there are multiple to choose from:

* **standard** match semantics emits matches as soon as they are detected by
the automaton. This is typically equivalent to the textbook non-overlapping
formulation of Aho-Corasick.
* **leftmost-first** match semantics means that 1) the next match is the match starting at the leftmost position and 2) among multiple matches starting at the same leftmost position, the match corresponding to the pattern provided first by the caller is reported. * **leftmost-longest** is like leftmost-first, except when there are multiple matches starting at the same leftmost position, the pattern corresponding to the longest match is returned. (The crate API documentation discusses these differences, with examples, in more depth on the `MatchKind` type.) The reason why supporting these match semantics is important is because it gives the user more control over the match procedure. For example, leftmost-first permits users to implement match priority by simply putting the higher priority patterns first. Leftmost-longest, on the other hand, permits finding the longest possible match, which might be useful when trying to find words matching a dictionary. Additionally, regex engines often want to use Aho-Corasick as an optimization when searching for an alternation of literals. In order to preserve correct match semantics, regex engines typically can't use the standard textbook definition directly, since regex engines will implement either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics. Supporting leftmost semantics requires a couple key changes: * Constructing the Aho-Corasick automaton changes a bit in both how the trie is constructed and how failure transitions are found. Namely, only a subset of the failure transitions are added. Specifically, only the failure transitions that either do not occur after a match or do occur after a match but preserve that match are kept. (More details on this can be found in `src/nfa/noncontiguous.rs`.) * The search algorithm changes slightly. Since we are looking for the leftmost match, we cannot quit as soon as a match is detected. Instead, after a match is detected, we must keep searching until either the end of the input or until a dead state is seen. (Dead states are not used for standard match semantics. Dead states mean that searching should stop after a match has been found.) Most other implementations of Aho-Corasick do support leftmost match semantics, but they do it with more overhead at search time, or even worse, with a queue of matches and sophisticated hijinks to disambiguate the matches. While our construction algorithm becomes a bit more complicated, the correct match semantics fall out from the structure of the automaton itself. # Overlapping matches One of the nice properties of an Aho-Corasick automaton is that it can report all possible matches, even when they overlap with one another. In this mode, the match semantics don't matter, since all possible matches are reported. Overlapping searches work just like regular searches, except the state identifier at which the previous search left off is carried over to the next search, so that it can pick up where it left off. If there are additional matches at that state, then they are reported before resuming the search. Enabling leftmost-first or leftmost-longest match semantics causes the automaton to use a subset of all failure transitions, which means that overlapping searches cannot be used. Therefore, if leftmost match semantics are used, attempting to do an overlapping search will return an error (or panic when using the infallible APIs). Thus, to get overlapping searches, the caller must use the default standard match semantics. 
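To make that restriction concrete, here is how it surfaces through this
crate's public API (this is a demonstration of the behavior described above,
not of the internals):

    use aho_corasick::{AhoCorasick, MatchKind};

    let ac = AhoCorasick::builder()
        .match_kind(MatchKind::LeftmostFirst)
        .build(&["Samwise", "Sam"])
        .unwrap();
    // Overlapping searches require standard semantics, so this returns an
    // error instead of an iterator.
    assert!(ac.try_find_overlapping_iter("Samwise").is_err());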
This behavior was chosen because there are only two alternatives, which were deemed worse: * Compile two automatons internally, one for standard semantics and one for the semantics requested by the caller (if not standard). * Create a new type, distinct from the `AhoCorasick` type, which has different capabilities based on the configuration options. The first is untenable because of the amount of memory used by the automaton. The second increases the complexity of the API too much by adding too many types that do similar things. It is conceptually much simpler to keep all searching isolated to a single type. # Stream searching Since Aho-Corasick is an automaton, it is possible to do partial searches on partial parts of the haystack, and then resume that search on subsequent pieces of the haystack. This is useful when the haystack you're trying to search is not stored contiguously in memory, or if one does not want to read the entire haystack into memory at once. Currently, only standard semantics are supported for stream searching. This is some of the more complicated code in this crate, and is something I would very much like to improve. In particular, it currently has the restriction that it must buffer at least enough of the haystack in memory in order to fit the longest possible match. The difficulty in getting stream searching right is that the implementation choices (such as the buffer size) often impact what the API looks like and what it's allowed to do. # Prefilters In some cases, Aho-Corasick is not the fastest way to find matches containing multiple patterns. Sometimes, the search can be accelerated using highly optimized SIMD routines. For example, consider searching the following patterns: Sherlock Moriarty Watson It is plausible that it would be much faster to quickly look for occurrences of the leading bytes, `S`, `M` or `W`, before trying to start searching via the automaton. Indeed, this is exactly what this crate will do. When there are more than three distinct starting bytes, then this crate will look for three distinct bytes occurring at any position in the patterns, while preferring bytes that are heuristically determined to be rare over others. For example: Abuzz Sanchez Vasquez Topaz Waltz Here, we have more than 3 distinct starting bytes, but all of the patterns contain `z`, which is typically a rare byte. In this case, the prefilter will scan for `z`, back up a bit, and then execute the Aho-Corasick automaton. If all of that fails, then a packed multiple substring algorithm will be attempted. Currently, the only algorithm available for this is Teddy, but more may be added in the future. Teddy is unlike the above prefilters in that it confirms its own matches, so when Teddy is active, it might not be necessary for Aho-Corasick to run at all. However, the current Teddy implementation only works in `x86_64` when SSSE3 or AVX2 are available or in `aarch64` (using NEON), and moreover, only works _well_ when there are a small number of patterns (say, less than 100). Teddy also requires the haystack to be of a certain length (more than 16-34 bytes). When the haystack is shorter than that, Rabin-Karp is used instead. (See `src/packed/rabinkarp.rs`.) There is a more thorough description of Teddy at [`src/packed/teddy/README.md`](src/packed/teddy/README.md). 
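As a parting illustration of the prefilter idea, here is a sketch of the
"rare byte" scan using the `memchr` crate (which this crate already depends
on via its `perf-literal` feature). This is only the candidate-generation
half of the story; the real prefilter also backs up from each candidate to a
plausible match start and confirms with the automaton:

    use memchr::memchr3_iter;

    // Yield candidate positions of up to three rare bytes. Each candidate
    // still needs to be confirmed by running the automaton over the
    // surrounding bytes.
    fn rare_byte_candidates(haystack: &[u8]) -> Vec<usize> {
        memchr3_iter(b'z', b'q', b'x', haystack).collect()
    }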
aho-corasick-1.1.3/LICENSE-MIT000064400000000000000000000020711046102023000136250ustar 00000000000000The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. aho-corasick-1.1.3/README.md000064400000000000000000000133311046102023000134510ustar 00000000000000aho-corasick ============ A library for finding occurrences of many patterns at once with SIMD acceleration in some cases. This library provides multiple pattern search principally through an implementation of the [Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), which builds a finite state machine for executing searches in linear time. Features include case insensitive matching, overlapping matches, fast searching via SIMD and optional full DFA construction and search & replace in streams. [![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions) [![crates.io](https://img.shields.io/crates/v/aho-corasick.svg)](https://crates.io/crates/aho-corasick) Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). ### Documentation https://docs.rs/aho-corasick ### Usage Run `cargo add aho-corasick` to automatically add this crate as a dependency in your `Cargo.toml` file. ### Example: basic searching This example shows how to search for occurrences of multiple patterns simultaneously. Each match includes the pattern that matched along with the byte offsets of the match. 
```rust use aho_corasick::{AhoCorasick, PatternID}; let patterns = &["apple", "maple", "Snapple"]; let haystack = "Nobody likes maple in their apple flavored Snapple."; let ac = AhoCorasick::new(patterns).unwrap(); let mut matches = vec![]; for mat in ac.find_iter(haystack) { matches.push((mat.pattern(), mat.start(), mat.end())); } assert_eq!(matches, vec![ (PatternID::must(1), 13, 18), (PatternID::must(0), 28, 33), (PatternID::must(2), 43, 50), ]); ``` ### Example: ASCII case insensitivity This is like the previous example, but matches `Snapple` case insensitively using `AhoCorasickBuilder`: ```rust use aho_corasick::{AhoCorasick, PatternID}; let patterns = &["apple", "maple", "snapple"]; let haystack = "Nobody likes maple in their apple flavored Snapple."; let ac = AhoCorasick::builder() .ascii_case_insensitive(true) .build(patterns) .unwrap(); let mut matches = vec![]; for mat in ac.find_iter(haystack) { matches.push((mat.pattern(), mat.start(), mat.end())); } assert_eq!(matches, vec![ (PatternID::must(1), 13, 18), (PatternID::must(0), 28, 33), (PatternID::must(2), 43, 50), ]); ``` ### Example: replacing matches in a stream This example shows how to execute a search and replace on a stream without loading the entire stream into memory first. ```rust,ignore use aho_corasick::AhoCorasick; let patterns = &["fox", "brown", "quick"]; let replace_with = &["sloth", "grey", "slow"]; // In a real example, these might be `std::fs::File`s instead. All you need to // do is supply a pair of `std::io::Read` and `std::io::Write` implementations. let rdr = "The quick brown fox."; let mut wtr = vec![]; let ac = AhoCorasick::new(patterns).unwrap(); ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with) .expect("stream_replace_all failed"); assert_eq!(b"The slow grey sloth.".to_vec(), wtr); ``` ### Example: finding the leftmost first match In the textbook description of Aho-Corasick, its formulation is typically structured such that it reports all possible matches, even when they overlap with another. In many cases, overlapping matches may not be desired, such as the case of finding all successive non-overlapping matches like you might with a standard regular expression. Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do this doesn't always work in the expected way, since it will report matches as soon as they are seen. For example, consider matching the regex `Samwise|Sam` against the text `Samwise`. Most regex engines (that are Perl-like, or non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick algorithm modified for reporting non-overlapping matches will report `Sam`. A novel contribution of this library is the ability to change the match semantics of Aho-Corasick (without additional search time overhead) such that `Samwise` is reported instead. 
For example, here's the standard approach: ```rust use aho_corasick::AhoCorasick; let patterns = &["Samwise", "Sam"]; let haystack = "Samwise"; let ac = AhoCorasick::new(patterns).unwrap(); let mat = ac.find(haystack).expect("should have a match"); assert_eq!("Sam", &haystack[mat.start()..mat.end()]); ``` And now here's the leftmost-first version, which matches how a Perl-like regex will work: ```rust use aho_corasick::{AhoCorasick, MatchKind}; let patterns = &["Samwise", "Sam"]; let haystack = "Samwise"; let ac = AhoCorasick::builder() .match_kind(MatchKind::LeftmostFirst) .build(patterns) .unwrap(); let mat = ac.find(haystack).expect("should have a match"); assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); ``` In addition to leftmost-first semantics, this library also supports leftmost-longest semantics, which match the POSIX behavior of a regular expression alternation. See `MatchKind` in the docs for more details. ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.60.0`. The current policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if `crate 1.0` requires Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust 1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum version of Rust. In general, this crate will be conservative with respect to the minimum supported version of Rust. ### FFI bindings * [G-Research/ahocorasick_rs](https://github.com/G-Research/ahocorasick_rs/) is a Python wrapper for this library. * [tmikus/ahocorasick_rs](https://github.com/tmikus/ahocorasick_rs) is a Go wrapper for this library. aho-corasick-1.1.3/UNLICENSE000064400000000000000000000022731046102023000134450ustar 00000000000000This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
For more information, please refer to <https://unlicense.org>

aho-corasick-1.1.3/rustfmt.toml

max_width = 79
use_small_heuristics = "max"

aho-corasick-1.1.3/src/ahocorasick.rs

use core::{
    fmt::Debug,
    panic::{RefUnwindSafe, UnwindSafe},
};

use alloc::{string::String, sync::Arc, vec::Vec};

use crate::{
    automaton::{self, Automaton, OverlappingState},
    dfa,
    nfa::{contiguous, noncontiguous},
    util::{
        error::{BuildError, MatchError},
        prefilter::Prefilter,
        primitives::{PatternID, StateID},
        search::{Anchored, Input, Match, MatchKind, StartKind},
    },
};

/// An automaton for searching multiple strings in linear time.
///
/// The `AhoCorasick` type supports a few basic ways of constructing an
/// automaton, with the default being [`AhoCorasick::new`]. However, there
/// are a fair number of configurable options that can be set by using
/// [`AhoCorasickBuilder`] instead. Such options include, but are not limited
/// to, how matches are determined, simple case insensitivity, whether to use
/// a DFA or not and various knobs for controlling the space-vs-time trade
/// offs taken when building the automaton.
///
/// # Resource usage
///
/// Aho-Corasick automatons are always constructed in `O(p)` time, where
/// `p` is the combined length of all patterns being searched. With that
/// said, building an automaton can be fairly costly because of high constant
/// factors, particularly when enabling the [DFA](AhoCorasickKind::DFA)
/// option with [`AhoCorasickBuilder::kind`]. For this reason, it's generally
/// a good idea to build an automaton once and reuse it as much as possible.
///
/// Aho-Corasick automatons can also use a fair bit of memory. To get
/// a concrete idea of how much memory is being used, try using the
/// [`AhoCorasick::memory_usage`] method.
///
/// To give a quick idea of the differences between Aho-Corasick
/// implementations and their resource usage, here's a sample of construction
/// times and heap memory used after building an automaton from 100,000
/// randomly selected titles from Wikipedia:
///
/// * 99MB for a [`noncontiguous::NFA`] in 240ms.
/// * 21MB for a [`contiguous::NFA`] in 275ms.
/// * 1.6GB for a [`dfa::DFA`] in 1.88s.
///
/// (Note that the memory usage above reflects the size of each automaton and
/// not peak memory usage. For example, building a contiguous NFA requires
/// first building a noncontiguous NFA. Once the contiguous NFA is built, the
/// noncontiguous NFA is freed.)
///
/// This experiment very strongly argues that a contiguous NFA is often the
/// best balance in terms of resource usage. It takes a little longer to
/// build, but its memory usage is quite small. Its search speed (not listed)
/// is also often faster than a noncontiguous NFA, but a little slower than a
/// DFA. Indeed, when no specific [`AhoCorasickKind`] is used (which is the
/// default), a contiguous NFA is used in most cases.
///
/// The only "catch" to using a contiguous NFA is that, because of its
/// variety of compression tricks, it may not be able to support automatons
/// as large as what the noncontiguous NFA supports. In which case, building
/// a contiguous NFA will fail and (by default) `AhoCorasick` will
/// automatically fall back to a noncontiguous NFA. (This typically only
/// happens when building automatons from millions of patterns.) Otherwise,
/// the small additional time for building a contiguous NFA is almost
/// certainly worth it.
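///
/// For a rough sense of these trade offs in code, one can request a specific
/// implementation and inspect its heap usage. (A sketch only: the exact
/// numbers depend entirely on the patterns given.)
///
/// ```
/// use aho_corasick::{AhoCorasick, AhoCorasickKind};
///
/// let ac = AhoCorasick::builder()
///     .kind(Some(AhoCorasickKind::DFA))
///     .build(&["foo", "bar", "quux", "baz"])
///     .unwrap();
/// // A DFA trades extra build time and heap memory for faster searches.
/// assert!(matches!(ac.kind(), AhoCorasickKind::DFA));
/// assert!(ac.memory_usage() > 0);
/// ```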
/// /// # Cloning /// /// The `AhoCorasick` type uses thread safe reference counting internally. It /// is guaranteed that it is cheap to clone. /// /// # Search configuration /// /// Most of the search routines accept anything that can be cheaply converted /// to an [`Input`]. This includes `&[u8]`, `&str` and `Input` itself. /// /// # Construction failure /// /// It is generally possible for building an Aho-Corasick automaton to fail. /// Construction can fail in generally one way: when the inputs provided are /// too big. Whether that's a pattern that is too long, too many patterns /// or some combination of both. A first approximation for the scale at which /// construction can fail is somewhere around "millions of patterns." /// /// For that reason, if you're building an Aho-Corasick automaton from /// untrusted input (or input that doesn't have any reasonable bounds on its /// size), then it is strongly recommended to handle the possibility of an /// error. /// /// If you're constructing an Aho-Corasick automaton from static or trusted /// data, then it is likely acceptable to panic (by calling `unwrap()` or /// `expect()`) if construction fails. /// /// # Fallibility /// /// The `AhoCorasick` type provides a number of methods for searching, as one /// might expect. Depending on how the Aho-Corasick automaton was built and /// depending on the search configuration, it is possible for a search to /// return an error. Since an error is _never_ dependent on the actual contents /// of the haystack, this type provides both infallible and fallible methods /// for searching. The infallible methods panic if an error occurs, and can be /// used for convenience and when you know the search will never return an /// error. /// /// For example, the [`AhoCorasick::find_iter`] method is the infallible /// version of the [`AhoCorasick::try_find_iter`] method. /// /// Examples of errors that can occur: /// /// * Running a search that requires [`MatchKind::Standard`] semantics (such /// as a stream or overlapping search) with an automaton that was built with /// [`MatchKind::LeftmostFirst`] or [`MatchKind::LeftmostLongest`] semantics. /// * Running an anchored search with an automaton that only supports /// unanchored searches. (By default, `AhoCorasick` only supports unanchored /// searches. But this can be toggled with [`AhoCorasickBuilder::start_kind`].) /// * Running an unanchored search with an automaton that only supports /// anchored searches. /// /// The common thread between the different types of errors is that they are /// all rooted in the automaton construction and search configurations. If /// those configurations are a static property of your program, then it is /// reasonable to call infallible routines since you know an error will never /// occur. And if one _does_ occur, then it's a bug in your program. /// /// To re-iterate, if the patterns, build or search configuration come from /// user or untrusted data, then you should handle errors at build or search /// time. If only the haystack comes from user or untrusted data, then there /// should be no need to handle errors anywhere and it is generally encouraged /// to `unwrap()` (or `expect()`) both build and search time calls. /// /// # Examples /// /// This example shows how to search for occurrences of multiple patterns /// simultaneously in a case insensitive fashion. Each match includes the /// pattern that matched along with the byte offsets of the match. 
/// /// ``` /// use aho_corasick::{AhoCorasick, PatternID}; /// /// let patterns = &["apple", "maple", "snapple"]; /// let haystack = "Nobody likes maple in their apple flavored Snapple."; /// /// let ac = AhoCorasick::builder() /// .ascii_case_insensitive(true) /// .build(patterns) /// .unwrap(); /// let mut matches = vec![]; /// for mat in ac.find_iter(haystack) { /// matches.push((mat.pattern(), mat.start(), mat.end())); /// } /// assert_eq!(matches, vec![ /// (PatternID::must(1), 13, 18), /// (PatternID::must(0), 28, 33), /// (PatternID::must(2), 43, 50), /// ]); /// ``` /// /// This example shows how to replace matches with some other string: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let patterns = &["fox", "brown", "quick"]; /// let haystack = "The quick brown fox."; /// let replace_with = &["sloth", "grey", "slow"]; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let result = ac.replace_all(haystack, replace_with); /// assert_eq!(result, "The slow grey sloth."); /// ``` #[derive(Clone)] pub struct AhoCorasick { /// The underlying Aho-Corasick automaton. It's one of /// nfa::noncontiguous::NFA, nfa::contiguous::NFA or dfa::DFA. aut: Arc, /// The specific Aho-Corasick kind chosen. This makes it possible to /// inspect any `AhoCorasick` and know what kind of search strategy it /// uses. kind: AhoCorasickKind, /// The start kind of this automaton as configured by the caller. /// /// We don't really *need* to put this here, since the underlying automaton /// will correctly return errors if the caller requests an unsupported /// search type. But we do keep this here for API behavior consistency. /// Namely, the NFAs in this crate support both unanchored and anchored /// searches unconditionally. There's no way to disable one or the other. /// They always both work. But the DFA in this crate specifically only /// supports both unanchored and anchored searches if it's configured to /// do so. Why? Because for the DFA, supporting both essentially requires /// two copies of the transition table: one generated by following failure /// transitions from the original NFA and one generated by not following /// those failure transitions. /// /// So why record the start kind here? Well, consider what happens /// when no specific 'AhoCorasickKind' is selected by the caller and /// 'StartKind::Unanchored' is used (both are the default). It *might* /// result in using a DFA or it might pick an NFA. If it picks an NFA, the /// caller would then be able to run anchored searches, even though the /// caller only asked for support for unanchored searches. Maybe that's /// fine, but what if the DFA was chosen instead? Oops, the caller would /// get an error. /// /// Basically, it seems bad to return an error or not based on some /// internal implementation choice. So we smooth things out and ensure /// anchored searches *always* report an error when only unanchored support /// was asked for (and vice versa), even if the underlying automaton /// supports it. start_kind: StartKind, } /// Convenience constructors for an Aho-Corasick searcher. To configure the /// searcher, use an [`AhoCorasickBuilder`] instead. impl AhoCorasick { /// Create a new Aho-Corasick automaton using the default configuration. /// /// The default configuration optimizes for less space usage, but at the /// expense of longer search times. To change the configuration, use /// [`AhoCorasickBuilder`]. 
/// /// This uses the default [`MatchKind::Standard`] match semantics, which /// reports a match as soon as it is found. This corresponds to the /// standard match semantics supported by textbook descriptions of the /// Aho-Corasick algorithm. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasick, PatternID}; /// /// let ac = AhoCorasick::new(&["foo", "bar", "baz"]).unwrap(); /// assert_eq!( /// Some(PatternID::must(1)), /// ac.find("xxx bar xxx").map(|m| m.pattern()), /// ); /// ``` pub fn new(patterns: I) -> Result where I: IntoIterator, P: AsRef<[u8]>, { AhoCorasickBuilder::new().build(patterns) } /// A convenience method for returning a new Aho-Corasick builder. /// /// This usually permits one to just import the `AhoCorasick` type. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasick, Match, MatchKind}; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(&["samwise", "sam"]) /// .unwrap(); /// assert_eq!(Some(Match::must(0, 0..7)), ac.find("samwise")); /// ``` pub fn builder() -> AhoCorasickBuilder { AhoCorasickBuilder::new() } } /// Infallible search routines. These APIs panic when the underlying search /// would otherwise fail. Infallible routines are useful because the errors are /// a result of both search-time configuration and what configuration is used /// to build the Aho-Corasick searcher. Both of these things are not usually /// the result of user input, and thus, an error is typically indicative of a /// programmer error. In cases where callers want errors instead of panics, use /// the corresponding `try` method in the section below. impl AhoCorasick { /// Returns true if and only if this automaton matches the haystack at any /// position. /// /// `input` may be any type that is cheaply convertible to an `Input`. This /// includes, but is not limited to, `&str` and `&[u8]`. /// /// Aside from convenience, when `AhoCorasick` was built with /// leftmost-first or leftmost-longest semantics, this might result in a /// search that visits less of the haystack than [`AhoCorasick::find`] /// would otherwise. (For standard semantics, matches are always /// immediately returned once they are seen, so there is no way for this to /// do less work in that case.) /// /// Note that there is no corresponding fallible routine for this method. /// If you need a fallible version of this, then [`AhoCorasick::try_find`] /// can be used with [`Input::earliest`] enabled. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&[ /// "foo", "bar", "quux", "baz", /// ]).unwrap(); /// assert!(ac.is_match("xxx bar xxx")); /// assert!(!ac.is_match("xxx qux xxx")); /// ``` pub fn is_match<'h, I: Into>>(&self, input: I) -> bool { self.aut .try_find(&input.into().earliest(true)) .expect("AhoCorasick::try_find is not expected to fail") .is_some() } /// Returns the location of the first match according to the match /// semantics that this automaton was constructed with. /// /// `input` may be any type that is cheaply convertible to an `Input`. This /// includes, but is not limited to, `&str` and `&[u8]`. /// /// This is the infallible version of [`AhoCorasick::try_find`]. /// /// # Panics /// /// This panics when [`AhoCorasick::try_find`] would return an error. 
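    ///
    /// For example, this panics because the searcher was built (by default)
    /// with only unanchored search support, while the search requests an
    /// anchored search. (A sketch of one such error condition; see the error
    /// list on [`AhoCorasick`] for others.)
    ///
    /// ```should_panic
    /// use aho_corasick::{AhoCorasick, Anchored, Input};
    ///
    /// let ac = AhoCorasick::new(&["samwise", "sam"]).unwrap();
    /// // Unanchored searches are the default, so asking for an anchored
    /// // search makes `try_find` return an error and `find` panic.
    /// ac.find(Input::new("samwise").anchored(Anchored::Yes));
    /// ```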
    ///
    /// # Examples
    ///
    /// Basic usage, with standard semantics:
    ///
    /// ```
    /// use aho_corasick::{AhoCorasick, MatchKind};
    ///
    /// let patterns = &["b", "abc", "abcd"];
    /// let haystack = "abcd";
    ///
    /// let ac = AhoCorasick::builder()
    ///     .match_kind(MatchKind::Standard) // default, not necessary
    ///     .build(patterns)
    ///     .unwrap();
    /// let mat = ac.find(haystack).expect("should have a match");
    /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
    /// ```
    ///
    /// Now with leftmost-first semantics:
    ///
    /// ```
    /// use aho_corasick::{AhoCorasick, MatchKind};
    ///
    /// let patterns = &["b", "abc", "abcd"];
    /// let haystack = "abcd";
    ///
    /// let ac = AhoCorasick::builder()
    ///     .match_kind(MatchKind::LeftmostFirst)
    ///     .build(patterns)
    ///     .unwrap();
    /// let mat = ac.find(haystack).expect("should have a match");
    /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
    /// ```
    ///
    /// And finally, leftmost-longest semantics:
    ///
    /// ```
    /// use aho_corasick::{AhoCorasick, MatchKind};
    ///
    /// let patterns = &["b", "abc", "abcd"];
    /// let haystack = "abcd";
    ///
    /// let ac = AhoCorasick::builder()
    ///     .match_kind(MatchKind::LeftmostLongest)
    ///     .build(patterns)
    ///     .unwrap();
    /// let mat = ac.find(haystack).expect("should have a match");
    /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
    /// ```
    ///
    /// # Example: configuring a search
    ///
    /// Because this method accepts anything that can be turned into an
    /// [`Input`], it's possible to provide an `Input` directly in order to
    /// configure the search. In this example, we show how to use the
    /// `earliest` option to force the search to return as soon as it knows
    /// a match has occurred.
    ///
    /// ```
    /// use aho_corasick::{AhoCorasick, Input, MatchKind};
    ///
    /// let patterns = &["b", "abc", "abcd"];
    /// let haystack = "abcd";
    ///
    /// let ac = AhoCorasick::builder()
    ///     .match_kind(MatchKind::LeftmostLongest)
    ///     .build(patterns)
    ///     .unwrap();
    /// let mat = ac.find(Input::new(haystack).earliest(true))
    ///     .expect("should have a match");
    /// // The correct leftmost-longest match here is 'abcd', but since we
    /// // told the search to quit as soon as it knows a match has occurred,
    /// // we get a different match back.
    /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
    /// ```
    pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
        self.try_find(input)
            .expect("AhoCorasick::try_find is not expected to fail")
    }

    /// Returns the location of the first overlapping match in the given
    /// input with respect to the current state of the underlying searcher.
    ///
    /// `input` may be any type that is cheaply convertible to an `Input`.
    /// This includes, but is not limited to, `&str` and `&[u8]`.
    ///
    /// Overlapping searches do not report matches in their return value.
    /// Instead, matches can be accessed via [`OverlappingState::get_match`]
    /// after a search call.
    ///
    /// This is the infallible version of
    /// [`AhoCorasick::try_find_overlapping`].
    ///
    /// # Panics
    ///
    /// This panics when [`AhoCorasick::try_find_overlapping`] would
    /// return an error. For example, when the Aho-Corasick searcher
    /// doesn't support overlapping searches. (Only searchers built with
    /// [`MatchKind::Standard`] semantics support overlapping searches.)
    ///
    /// # Example
    ///
    /// This shows how we can repeatedly call an overlapping search without
    /// ever needing to explicitly re-slice the haystack. Overlapping search
    /// works this way because searches depend on state saved during the
    /// previous search.
/// /// ``` /// use aho_corasick::{ /// automaton::OverlappingState, /// AhoCorasick, Input, Match, /// }; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let mut state = OverlappingState::start(); /// /// ac.find_overlapping(haystack, &mut state); /// assert_eq!(Some(Match::must(2, 0..3)), state.get_match()); /// /// ac.find_overlapping(haystack, &mut state); /// assert_eq!(Some(Match::must(0, 0..6)), state.get_match()); /// /// ac.find_overlapping(haystack, &mut state); /// assert_eq!(Some(Match::must(2, 11..14)), state.get_match()); /// /// ac.find_overlapping(haystack, &mut state); /// assert_eq!(Some(Match::must(2, 22..25)), state.get_match()); /// /// ac.find_overlapping(haystack, &mut state); /// assert_eq!(Some(Match::must(0, 22..28)), state.get_match()); /// /// ac.find_overlapping(haystack, &mut state); /// assert_eq!(Some(Match::must(1, 22..31)), state.get_match()); /// /// // No more matches to be found. /// ac.find_overlapping(haystack, &mut state); /// assert_eq!(None, state.get_match()); /// ``` pub fn find_overlapping<'h, I: Into<Input<'h>>>( &self, input: I, state: &mut OverlappingState, ) { self.try_find_overlapping(input, state).expect( "AhoCorasick::try_find_overlapping is not expected to fail", ) } /// Returns an iterator of non-overlapping matches, using the match /// semantics that this automaton was constructed with. /// /// `input` may be any type that is cheaply convertible to an `Input`. This /// includes, but is not limited to, `&str` and `&[u8]`. /// /// This is the infallible version of [`AhoCorasick::try_find_iter`]. /// /// # Panics /// /// This panics when [`AhoCorasick::try_find_iter`] would return an error.
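/// /// The examples below collect pattern IDs. For illustration, a minimal sketch showing that a match's byte offsets can be collected just as easily (the pattern set here is arbitrary): /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&["app"]).unwrap(); /// let spans: Vec<(usize, usize)> = ac /// .find_iter("app appendage") /// .map(|m| (m.start(), m.end())) /// .collect(); /// assert_eq!(vec![(0, 3), (4, 7)], spans); /// ```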
/// /// # Examples /// /// Basic usage, with standard semantics: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind, PatternID}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::Standard) // default, not necessary /// .build(patterns) /// .unwrap(); /// let matches: Vec<PatternID> = ac /// .find_iter(haystack) /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![ /// PatternID::must(2), /// PatternID::must(2), /// PatternID::must(2), /// ], matches); /// ``` /// /// Now with leftmost-first semantics: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind, PatternID}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let matches: Vec<PatternID> = ac /// .find_iter(haystack) /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![ /// PatternID::must(0), /// PatternID::must(2), /// PatternID::must(0), /// ], matches); /// ``` /// /// And finally, leftmost-longest semantics: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind, PatternID}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostLongest) /// .build(patterns) /// .unwrap(); /// let matches: Vec<PatternID> = ac /// .find_iter(haystack) /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![ /// PatternID::must(0), /// PatternID::must(2), /// PatternID::must(1), /// ], matches); /// ``` pub fn find_iter<'a, 'h, I: Into<Input<'h>>>( &'a self, input: I, ) -> FindIter<'a, 'h> { self.try_find_iter(input) .expect("AhoCorasick::try_find_iter is not expected to fail") } /// Returns an iterator of overlapping matches. Stated differently, this /// returns an iterator of all possible matches at every position. /// /// `input` may be any type that is cheaply convertible to an `Input`. This /// includes, but is not limited to, `&str` and `&[u8]`. /// /// This is the infallible version of /// [`AhoCorasick::try_find_overlapping_iter`]. /// /// # Panics /// /// This panics when `AhoCorasick::try_find_overlapping_iter` would return /// an error. For example, when the Aho-Corasick searcher is built with /// either leftmost-first or leftmost-longest match semantics. Stated /// differently, overlapping searches require one to build the searcher /// with [`MatchKind::Standard`] (it is the default). /// /// # Example: basic usage /// /// ``` /// use aho_corasick::{AhoCorasick, PatternID}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let matches: Vec<PatternID> = ac /// .find_overlapping_iter(haystack) /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![ /// PatternID::must(2), /// PatternID::must(0), /// PatternID::must(2), /// PatternID::must(2), /// PatternID::must(0), /// PatternID::must(1), /// ], matches); /// ``` pub fn find_overlapping_iter<'a, 'h, I: Into<Input<'h>>>( &'a self, input: I, ) -> FindOverlappingIter<'a, 'h> { self.try_find_overlapping_iter(input).expect( "AhoCorasick::try_find_overlapping_iter is not expected to fail", ) } /// Replace all matches with a corresponding value in the `replace_with` /// slice given.
Matches correspond to the same matches as reported by /// [`AhoCorasick::find_iter`]. /// /// Replacements are determined by the index of the matching pattern. /// For example, if the pattern with index `2` is found, then it is /// replaced by `replace_with[2]`. /// /// This is the infallible version of [`AhoCorasick::try_replace_all`]. /// /// # Panics /// /// This panics when [`AhoCorasick::try_replace_all`] would return an /// error. /// /// This also panics when `replace_with.len()` does not equal /// [`AhoCorasick::patterns_len`]. /// /// # Example: basic usage /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let result = ac.replace_all(haystack, &["x", "y", "z"]); /// assert_eq!("x the z to the xage", result); /// ``` pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String where B: AsRef<str>, { self.try_replace_all(haystack, replace_with) .expect("AhoCorasick::try_replace_all is not expected to fail") } /// Replace all matches using raw bytes with a corresponding value in the /// `replace_with` slice given. Matches correspond to the same matches as /// reported by [`AhoCorasick::find_iter`]. /// /// Replacements are determined by the index of the matching pattern. /// For example, if the pattern with index `2` is found, then it is /// replaced by `replace_with[2]`. /// /// This is the infallible version of /// [`AhoCorasick::try_replace_all_bytes`]. /// /// # Panics /// /// This panics when [`AhoCorasick::try_replace_all_bytes`] would return an /// error. /// /// This also panics when `replace_with.len()` does not equal /// [`AhoCorasick::patterns_len`]. /// /// # Example: basic usage /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = b"append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]); /// assert_eq!(b"x the z to the xage".to_vec(), result); /// ``` pub fn replace_all_bytes<B>( &self, haystack: &[u8], replace_with: &[B], ) -> Vec<u8> where B: AsRef<[u8]>, { self.try_replace_all_bytes(haystack, replace_with) .expect("AhoCorasick::try_replace_all_bytes should not fail") } /// Replace all matches using a closure called on each match. /// Matches correspond to the same matches as reported by /// [`AhoCorasick::find_iter`]. /// /// The closure accepts three parameters: the match found, the text of /// the match and a string buffer with which to write the replaced text /// (if any). If the closure returns `true`, then it continues to the next /// match. If the closure returns `false`, then searching is stopped. /// /// Note that any matches with boundaries that don't fall on a valid UTF-8 /// boundary are silently skipped. /// /// This is the infallible version of /// [`AhoCorasick::try_replace_all_with`]. /// /// # Panics /// /// This panics when [`AhoCorasick::try_replace_all_with`] would return an /// error.
/// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let mut result = String::new(); /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { /// dst.push_str(&mat.pattern().as_usize().to_string()); /// true /// }); /// assert_eq!("0 the 2 to the 0age", result); /// ``` /// /// Stopping the replacement by returning `false` (continued from the /// example above): /// /// ``` /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID}; /// # let patterns = &["append", "appendage", "app"]; /// # let haystack = "append the app to the appendage"; /// # let ac = AhoCorasick::builder() /// #     .match_kind(MatchKind::LeftmostFirst) /// #     .build(patterns) /// #     .unwrap(); /// let mut result = String::new(); /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { /// dst.push_str(&mat.pattern().as_usize().to_string()); /// mat.pattern() != PatternID::must(2) /// }); /// assert_eq!("0 the 2 to the appendage", result); /// ``` pub fn replace_all_with<F>( &self, haystack: &str, dst: &mut String, replace_with: F, ) where F: FnMut(&Match, &str, &mut String) -> bool, { self.try_replace_all_with(haystack, dst, replace_with) .expect("AhoCorasick::try_replace_all_with should not fail") } /// Replace all matches using raw bytes with a closure called on each /// match. Matches correspond to the same matches as reported by /// [`AhoCorasick::find_iter`]. /// /// The closure accepts three parameters: the match found, the text of /// the match and a byte buffer with which to write the replaced text /// (if any). If the closure returns `true`, then it continues to the next /// match. If the closure returns `false`, then searching is stopped. /// /// This is the infallible version of /// [`AhoCorasick::try_replace_all_with_bytes`]. /// /// # Panics /// /// This panics when [`AhoCorasick::try_replace_all_with_bytes`] would /// return an error.
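/// /// Unlike [`AhoCorasick::replace_all_with`], no UTF-8 boundary checks are performed here: the haystack is raw bytes, so a match may split a codepoint and still be replaced. For illustration, a minimal sketch of that difference (the single raw byte pattern is contrived for the demonstration): /// /// ``` /// use aho_corasick::AhoCorasick; /// /// // A pattern that can only match in the middle of a multi-byte /// // UTF-8 sequence. /// let ac = AhoCorasick::new(&[&b"\x98"[..]]).unwrap(); /// /// // The &str API silently skips the non-boundary match... /// let mut s = String::new(); /// ac.replace_all_with("☃", &mut s, |_, _, dst| { /// dst.push('X'); /// true /// }); /// assert_eq!("☃", s); /// /// // ...but the byte-oriented API performs the replacement. /// let mut b = vec![]; /// ac.replace_all_with_bytes("☃".as_bytes(), &mut b, |_, _, dst| { /// dst.push(b'X'); /// true /// }); /// assert_eq!(b"\xE2X\x83".to_vec(), b); /// ```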
/// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = b"append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let mut result = vec![]; /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { /// dst.extend(mat.pattern().as_usize().to_string().bytes()); /// true /// }); /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result); /// ``` /// /// Stopping the replacement by returning `false` (continued from the /// example above): /// /// ``` /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID}; /// # let patterns = &["append", "appendage", "app"]; /// # let haystack = b"append the app to the appendage"; /// # let ac = AhoCorasick::builder() /// #     .match_kind(MatchKind::LeftmostFirst) /// #     .build(patterns) /// #     .unwrap(); /// let mut result = vec![]; /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { /// dst.extend(mat.pattern().as_usize().to_string().bytes()); /// mat.pattern() != PatternID::must(2) /// }); /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result); /// ``` pub fn replace_all_with_bytes<F>( &self, haystack: &[u8], dst: &mut Vec<u8>, replace_with: F, ) where F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool, { self.try_replace_all_with_bytes(haystack, dst, replace_with) .expect("AhoCorasick::try_replace_all_with_bytes should not fail") } /// Returns an iterator of non-overlapping matches in the given /// stream. Matches correspond to the same matches as reported by /// [`AhoCorasick::find_iter`]. /// /// The matches yielded by this iterator use absolute position offsets in /// the stream given, where the first byte has index `0`. Matches are /// yielded until the stream is exhausted. /// /// Each item yielded by the iterator is a `Result<Match, /// std::io::Error>`, where an error is yielded if there was a problem /// reading from the reader given. /// /// When searching a stream, an internal buffer is used. Therefore, callers /// should avoid providing a buffered reader, if possible. /// /// This is the infallible version of /// [`AhoCorasick::try_stream_find_iter`]. Note that both methods return /// iterators that produce `Result` values. The difference is that this /// routine panics if _construction_ of the iterator failed. The `Result` /// values yielded by the iterator come from whether the given reader /// returns an error or not during the search. /// /// # Memory usage /// /// In general, searching streams will use a constant amount of memory for /// its internal buffer. The one requirement is that the internal buffer /// must be at least the size of the longest possible match. In most use /// cases, the default buffer size will be much larger than any individual /// match. /// /// # Panics /// /// This panics when [`AhoCorasick::try_stream_find_iter`] would return /// an error. For example, when the Aho-Corasick searcher doesn't support /// stream searches. (Only searchers built with [`MatchKind::Standard`] /// semantics support stream searches.)
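/// /// For illustration, a minimal sketch of the panicking case described above, where the searcher was built with match semantics that streams do not support: /// /// ```should_panic /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(&["foo"]) /// .unwrap(); /// // Stream searching requires MatchKind::Standard, so this panics. /// let _ = ac.stream_find_iter("foo".as_bytes()); /// ```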
/// /// # Example: basic usage /// /// ``` /// use aho_corasick::{AhoCorasick, PatternID}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let mut matches = vec![]; /// for result in ac.stream_find_iter(haystack.as_bytes()) { /// let mat = result?; /// matches.push(mat.pattern()); /// } /// assert_eq!(vec![ /// PatternID::must(2), /// PatternID::must(2), /// PatternID::must(2), /// ], matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[cfg(feature = "std")] pub fn stream_find_iter<'a, R: std::io::Read>( &'a self, rdr: R, ) -> StreamFindIter<'a, R> { self.try_stream_find_iter(rdr) .expect("AhoCorasick::try_stream_find_iter should not fail") } } /// Fallible search routines. These APIs return an error in cases where the /// infallible routines would panic. impl AhoCorasick { /// Returns the location of the first match according to the match /// semantics that this automaton was constructed with, and according /// to the given `Input` configuration. /// /// This is the fallible version of [`AhoCorasick::find`]. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the given `Input` configuration. /// /// For example, if the Aho-Corasick searcher only supports anchored /// searches or only supports unanchored searches, then providing an /// `Input` that requests an anchored (or unanchored) search when it isn't /// supported would result in an error. /// /// # Example: leftmost-first searching /// /// Basic usage with leftmost-first semantics: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind, Input}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "foo abcd"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let mat = ac.try_find(haystack)?.expect("should have a match"); /// assert_eq!("abc", &haystack[mat.span()]); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: anchored leftmost-first searching /// /// This shows how to anchor the search, so that even if the haystack /// contains a match somewhere, a match won't be reported unless one can /// be found that starts at the beginning of the search: /// /// ``` /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "foo abcd"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .start_kind(StartKind::Anchored) /// .build(patterns) /// .unwrap(); /// let input = Input::new(haystack).anchored(Anchored::Yes); /// assert_eq!(None, ac.try_find(input)?); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// If the beginning of the search is changed to where a match begins, then /// it will be found: /// /// ``` /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "foo abcd"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .start_kind(StartKind::Anchored) /// .build(patterns) /// .unwrap(); /// let input = Input::new(haystack).range(4..).anchored(Anchored::Yes); /// let mat = ac.try_find(input)?.expect("should have a match"); /// assert_eq!("abc", &haystack[mat.span()]); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: earliest leftmost-first searching /// /// This shows how to run an "earliest" search even when the Aho-Corasick /// searcher was compiled
with leftmost-first match semantics. In this /// case, the search is stopped as soon as it is known that a match has /// occurred, even if it doesn't correspond to the leftmost-first match. /// /// ``` /// use aho_corasick::{AhoCorasick, Input, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "foo abcd"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let input = Input::new(haystack).earliest(true); /// let mat = ac.try_find(input)?.expect("should have a match"); /// assert_eq!("b", &haystack[mat.span()]); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn try_find<'h, I: Into<Input<'h>>>( &self, input: I, ) -> Result<Option<Match>, MatchError> { let input = input.into(); enforce_anchored_consistency(self.start_kind, input.get_anchored())?; self.aut.try_find(&input) } /// Returns the location of the first overlapping match in the given /// input with respect to the current state of the underlying searcher. /// /// Overlapping searches do not report matches in their return value. /// Instead, matches can be accessed via [`OverlappingState::get_match`] /// after a search call. /// /// This is the fallible version of [`AhoCorasick::find_overlapping`]. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the given `Input` configuration or if overlapping search is not /// supported. /// /// One example is that only Aho-Corasick searchers built with /// [`MatchKind::Standard`] semantics support overlapping searches. Using /// any other match semantics will result in this returning an error. /// /// # Example: basic usage /// /// This shows how we can repeatedly call an overlapping search without /// ever needing to explicitly re-slice the haystack. Overlapping search /// works this way because searches depend on state saved during the /// previous search. /// /// ``` /// use aho_corasick::{ /// automaton::OverlappingState, /// AhoCorasick, Input, Match, /// }; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let mut state = OverlappingState::start(); /// /// ac.try_find_overlapping(haystack, &mut state)?; /// assert_eq!(Some(Match::must(2, 0..3)), state.get_match()); /// /// ac.try_find_overlapping(haystack, &mut state)?; /// assert_eq!(Some(Match::must(0, 0..6)), state.get_match()); /// /// ac.try_find_overlapping(haystack, &mut state)?; /// assert_eq!(Some(Match::must(2, 11..14)), state.get_match()); /// /// ac.try_find_overlapping(haystack, &mut state)?; /// assert_eq!(Some(Match::must(2, 22..25)), state.get_match()); /// /// ac.try_find_overlapping(haystack, &mut state)?; /// assert_eq!(Some(Match::must(0, 22..28)), state.get_match()); /// /// ac.try_find_overlapping(haystack, &mut state)?; /// assert_eq!(Some(Match::must(1, 22..31)), state.get_match()); /// /// // No more matches to be found. /// ac.try_find_overlapping(haystack, &mut state)?; /// assert_eq!(None, state.get_match()); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: implementing your own overlapping iteration /// /// The previous example can be easily adapted to implement your own /// iteration by repeatedly calling `try_find_overlapping` until either /// an error occurs or no more matches are reported.
/// /// This is effectively equivalent to the iterator returned by /// [`AhoCorasick::try_find_overlapping_iter`], with the only difference /// being that the iterator checks for errors before construction and /// absolves the caller of needing to check for errors on every search /// call. (Indeed, if the first `try_find_overlapping` call succeeds and /// the same `Input` is given to subsequent calls, then all subsequent /// calls are guaranteed to succeed.) /// /// ``` /// use aho_corasick::{ /// automaton::OverlappingState, /// AhoCorasick, Input, Match, /// }; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let mut state = OverlappingState::start(); /// let mut matches = vec![]; /// /// loop { /// ac.try_find_overlapping(haystack, &mut state)?; /// let mat = match state.get_match() { /// None => break, /// Some(mat) => mat, /// }; /// matches.push(mat); /// } /// let expected = vec![ /// Match::must(2, 0..3), /// Match::must(0, 0..6), /// Match::must(2, 11..14), /// Match::must(2, 22..25), /// Match::must(0, 22..28), /// Match::must(1, 22..31), /// ]; /// assert_eq!(expected, matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: anchored iteration /// /// The previous example can also be adapted to implement /// iteration over all anchored matches. In particular, /// [`AhoCorasick::try_find_overlapping_iter`] does not support this /// because it isn't totally clear what the match semantics ought to be. /// /// In this example, we will find all overlapping matches that start at /// the beginning of our search. /// /// ``` /// use aho_corasick::{ /// automaton::OverlappingState, /// AhoCorasick, Anchored, Input, Match, StartKind, /// }; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .start_kind(StartKind::Anchored) /// .build(patterns) /// .unwrap(); /// let input = Input::new(haystack).anchored(Anchored::Yes); /// let mut state = OverlappingState::start(); /// let mut matches = vec![]; /// /// loop { /// ac.try_find_overlapping(input.clone(), &mut state)?; /// let mat = match state.get_match() { /// None => break, /// Some(mat) => mat, /// }; /// matches.push(mat); /// } /// let expected = vec![ /// Match::must(2, 0..3), /// Match::must(0, 0..6), /// ]; /// assert_eq!(expected, matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn try_find_overlapping<'h, I: Into<Input<'h>>>( &self, input: I, state: &mut OverlappingState, ) -> Result<(), MatchError> { let input = input.into(); enforce_anchored_consistency(self.start_kind, input.get_anchored())?; self.aut.try_find_overlapping(&input, state) } /// Returns an iterator of non-overlapping matches, using the match /// semantics that this automaton was constructed with. /// /// This is the fallible version of [`AhoCorasick::find_iter`]. /// /// Note that the error returned by this method occurs during construction /// of the iterator. The iterator itself yields `Match` values. That is, /// once the iterator is constructed, the iteration itself will never /// report an error. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the given `Input` configuration.
/// /// For example, if the Aho-Corasick searcher only supports anchored /// searches or only supports unanchored searches, then providing an /// `Input` that requests an anchored (or unanchored) search when it isn't /// supported would result in an error. /// /// # Example: leftmost-first searching /// /// Basic usage with leftmost-first semantics: /// /// ``` /// use aho_corasick::{AhoCorasick, Input, MatchKind, PatternID}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let matches: Vec<PatternID> = ac /// .try_find_iter(Input::new(haystack))? /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![ /// PatternID::must(0), /// PatternID::must(2), /// PatternID::must(0), /// ], matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: anchored leftmost-first searching /// /// This shows how to anchor the search, such that all matches must begin /// at the starting location of the search. For an iterator, an anchored /// search implies that all matches are adjacent. /// /// ``` /// use aho_corasick::{ /// AhoCorasick, Anchored, Input, MatchKind, PatternID, StartKind, /// }; /// /// let patterns = &["foo", "bar", "quux"]; /// let haystack = "fooquuxbar foo"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .start_kind(StartKind::Anchored) /// .build(patterns) /// .unwrap(); /// let matches: Vec<PatternID> = ac /// .try_find_iter(Input::new(haystack).anchored(Anchored::Yes))? /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![ /// PatternID::must(0), /// PatternID::must(2), /// PatternID::must(1), /// // The final 'foo' is not found because it is not adjacent to the /// // 'bar' match. It needs to be adjacent because our search is /// // anchored. /// ], matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn try_find_iter<'a, 'h, I: Into<Input<'h>>>( &'a self, input: I, ) -> Result<FindIter<'a, 'h>, MatchError> { let input = input.into(); enforce_anchored_consistency(self.start_kind, input.get_anchored())?; Ok(FindIter(self.aut.try_find_iter(input)?)) } /// Returns an iterator of overlapping matches. /// /// This is the fallible version of [`AhoCorasick::find_overlapping_iter`]. /// /// Note that the error returned by this method occurs during construction /// of the iterator. The iterator itself yields `Match` values. That is, /// once the iterator is constructed, the iteration itself will never /// report an error. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the given `Input` configuration or does not support overlapping /// searches. /// /// One example is that only Aho-Corasick searchers built with /// [`MatchKind::Standard`] semantics support overlapping searches. Using /// any other match semantics will result in this returning an error. /// /// # Example: basic usage /// /// ``` /// use aho_corasick::{AhoCorasick, Input, PatternID}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let matches: Vec<PatternID> = ac /// .try_find_overlapping_iter(Input::new(haystack))?
/// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![ /// PatternID::must(2), /// PatternID::must(0), /// PatternID::must(2), /// PatternID::must(2), /// PatternID::must(0), /// PatternID::must(1), /// ], matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: anchored overlapping search returns an error /// /// It isn't clear what the match semantics for anchored overlapping /// iterators *ought* to be, so currently an error is returned. Callers /// may use [`AhoCorasick::try_find_overlapping`] to implement their own /// semantics if desired. /// /// ``` /// use aho_corasick::{AhoCorasick, Anchored, Input, StartKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "appendappendage app"; /// /// let ac = AhoCorasick::builder() /// .start_kind(StartKind::Anchored) /// .build(patterns) /// .unwrap(); /// let input = Input::new(haystack).anchored(Anchored::Yes); /// assert!(ac.try_find_overlapping_iter(input).is_err()); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn try_find_overlapping_iter<'a, 'h, I: Into<Input<'h>>>( &'a self, input: I, ) -> Result<FindOverlappingIter<'a, 'h>, MatchError> { let input = input.into(); enforce_anchored_consistency(self.start_kind, input.get_anchored())?; Ok(FindOverlappingIter(self.aut.try_find_overlapping_iter(input)?)) } /// Replace all matches with a corresponding value in the `replace_with` /// slice given. Matches correspond to the same matches as reported by /// [`AhoCorasick::try_find_iter`]. /// /// Replacements are determined by the index of the matching pattern. /// For example, if the pattern with index `2` is found, then it is /// replaced by `replace_with[2]`. /// /// # Panics /// /// This panics when `replace_with.len()` does not equal /// [`AhoCorasick::patterns_len`]. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the default `Input` configuration. More specifically, this occurs only /// when the Aho-Corasick searcher does not support unanchored searches /// since this replacement routine always does an unanchored search. /// /// # Example: basic usage /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let result = ac.try_replace_all(haystack, &["x", "y", "z"])?; /// assert_eq!("x the z to the xage", result); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn try_replace_all<B>( &self, haystack: &str, replace_with: &[B], ) -> Result<String, MatchError> where B: AsRef<str>, { enforce_anchored_consistency(self.start_kind, Anchored::No)?; self.aut.try_replace_all(haystack, replace_with) } /// Replace all matches using raw bytes with a corresponding value in the /// `replace_with` slice given. Matches correspond to the same matches as /// reported by [`AhoCorasick::try_find_iter`]. /// /// Replacements are determined by the index of the matching pattern. /// For example, if the pattern with index `2` is found, then it is /// replaced by `replace_with[2]`. /// /// This is the fallible version of [`AhoCorasick::replace_all_bytes`]. /// /// # Panics /// /// This panics when `replace_with.len()` does not equal /// [`AhoCorasick::patterns_len`]. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the default `Input` configuration.
More specifically, this occurs only /// when the Aho-Corasick searcher does not support unanchored searches /// since this replacement routine always does an unanchored search. /// /// # Example: basic usage /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = b"append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let result = ac.try_replace_all_bytes(haystack, &["x", "y", "z"])?; /// assert_eq!(b"x the z to the xage".to_vec(), result); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn try_replace_all_bytes<B>( &self, haystack: &[u8], replace_with: &[B], ) -> Result<Vec<u8>, MatchError> where B: AsRef<[u8]>, { enforce_anchored_consistency(self.start_kind, Anchored::No)?; self.aut.try_replace_all_bytes(haystack, replace_with) } /// Replace all matches using a closure called on each match. /// Matches correspond to the same matches as reported by /// [`AhoCorasick::try_find_iter`]. /// /// The closure accepts three parameters: the match found, the text of /// the match and a string buffer with which to write the replaced text /// (if any). If the closure returns `true`, then it continues to the next /// match. If the closure returns `false`, then searching is stopped. /// /// Note that any matches with boundaries that don't fall on a valid UTF-8 /// boundary are silently skipped. /// /// This is the fallible version of [`AhoCorasick::replace_all_with`]. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the default `Input` configuration. More specifically, this occurs only /// when the Aho-Corasick searcher does not support unanchored searches /// since this replacement routine always does an unanchored search. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let mut result = String::new(); /// ac.try_replace_all_with(haystack, &mut result, |mat, _, dst| { /// dst.push_str(&mat.pattern().as_usize().to_string()); /// true /// })?; /// assert_eq!("0 the 2 to the 0age", result); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// Stopping the replacement by returning `false` (continued from the /// example above): /// /// ``` /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID}; /// # let patterns = &["append", "appendage", "app"]; /// # let haystack = "append the app to the appendage"; /// # let ac = AhoCorasick::builder() /// #     .match_kind(MatchKind::LeftmostFirst) /// #     .build(patterns) /// #     .unwrap(); /// let mut result = String::new(); /// ac.try_replace_all_with(haystack, &mut result, |mat, _, dst| { /// dst.push_str(&mat.pattern().as_usize().to_string()); /// mat.pattern() != PatternID::must(2) /// })?; /// assert_eq!("0 the 2 to the appendage", result); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn try_replace_all_with<F>( &self, haystack: &str, dst: &mut String, replace_with: F, ) -> Result<(), MatchError> where F: FnMut(&Match, &str, &mut String) -> bool, { enforce_anchored_consistency(self.start_kind, Anchored::No)?; self.aut.try_replace_all_with(haystack, dst, replace_with) } /// Replace all matches using raw bytes with a closure called on each /// match.
Matches correspond to the same matches as reported by /// [`AhoCorasick::try_find_iter`]. /// /// The closure accepts three parameters: the match found, the text of /// the match and a byte buffer with which to write the replaced text /// (if any). If the closure returns `true`, then it continues to the next /// match. If the closure returns `false`, then searching is stopped. /// /// This is the fallible version of /// [`AhoCorasick::replace_all_with_bytes`]. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the default `Input` configuration. More specifically, this occurs only /// when the Aho-Corasick searcher does not support unanchored searches /// since this replacement routine always does an unanchored search. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = b"append the app to the appendage"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let mut result = vec![]; /// ac.try_replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { /// dst.extend(mat.pattern().as_usize().to_string().bytes()); /// true /// })?; /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// Stopping the replacement by returning `false` (continued from the /// example above): /// /// ``` /// # use aho_corasick::{AhoCorasick, MatchKind, PatternID}; /// # let patterns = &["append", "appendage", "app"]; /// # let haystack = b"append the app to the appendage"; /// # let ac = AhoCorasick::builder() /// #     .match_kind(MatchKind::LeftmostFirst) /// #     .build(patterns) /// #     .unwrap(); /// let mut result = vec![]; /// ac.try_replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { /// dst.extend(mat.pattern().as_usize().to_string().bytes()); /// mat.pattern() != PatternID::must(2) /// })?; /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub fn try_replace_all_with_bytes<F>( &self, haystack: &[u8], dst: &mut Vec<u8>, replace_with: F, ) -> Result<(), MatchError> where F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool, { enforce_anchored_consistency(self.start_kind, Anchored::No)?; self.aut.try_replace_all_with_bytes(haystack, dst, replace_with) } /// Returns an iterator of non-overlapping matches in the given /// stream. Matches correspond to the same matches as reported by /// [`AhoCorasick::try_find_iter`]. /// /// The matches yielded by this iterator use absolute position offsets in /// the stream given, where the first byte has index `0`. Matches are /// yielded until the stream is exhausted. /// /// Each item yielded by the iterator is a `Result<Match, /// std::io::Error>`, where an error is yielded if there was a problem /// reading from the reader given. /// /// When searching a stream, an internal buffer is used. Therefore, callers /// should avoid providing a buffered reader, if possible. /// /// This is the fallible version of [`AhoCorasick::stream_find_iter`]. /// Note that both methods return iterators that produce `Result` values. /// The difference is that this routine returns an error if _construction_ /// of the iterator failed. The `Result` values yielded by the iterator /// come from whether the given reader returns an error or not during the /// search. /// /// # Memory usage /// /// In general, searching streams will use a constant amount of memory for /// its internal buffer.
The one requirement is that the internal buffer /// must be at least the size of the longest possible match. In most use /// cases, the default buffer size will be much larger than any individual /// match. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the default `Input` configuration. More specifically, this occurs only /// when the Aho-Corasick searcher does not support unanchored searches /// since this stream searching routine always does an unanchored search. /// /// This also returns an error if the searcher does not support stream /// searches. Only searchers built with [`MatchKind::Standard`] semantics /// support stream searches. /// /// # Example: basic usage /// /// ``` /// use aho_corasick::{AhoCorasick, PatternID}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let mut matches = vec![]; /// for result in ac.try_stream_find_iter(haystack.as_bytes())? { /// let mat = result?; /// matches.push(mat.pattern()); /// } /// assert_eq!(vec![ /// PatternID::must(2), /// PatternID::must(2), /// PatternID::must(2), /// ], matches); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[cfg(feature = "std")] pub fn try_stream_find_iter<'a, R: std::io::Read>( &'a self, rdr: R, ) -> Result<StreamFindIter<'a, R>, MatchError> { enforce_anchored_consistency(self.start_kind, Anchored::No)?; self.aut.try_stream_find_iter(rdr).map(StreamFindIter) } /// Search for and replace all matches of this automaton in /// the given reader, and write the replacements to the given /// writer. Matches correspond to the same matches as reported by /// [`AhoCorasick::try_find_iter`]. /// /// Replacements are determined by the index of the matching pattern. For /// example, if the pattern with index `2` is found, then it is replaced by /// `replace_with[2]`. /// /// After all matches are replaced, the writer is _not_ flushed. /// /// If there was a problem reading from the given reader or writing to the /// given writer, then the corresponding `io::Error` is returned and all /// replacement is stopped. /// /// When searching a stream, an internal buffer is used. Therefore, callers /// should avoid providing a buffered reader, if possible. However, /// callers may want to provide a buffered writer. /// /// Note that there is currently no infallible version of this routine. /// /// # Memory usage /// /// In general, searching streams will use a constant amount of memory for /// its internal buffer. The one requirement is that the internal buffer /// must be at least the size of the longest possible match. In most use /// cases, the default buffer size will be much larger than any individual /// match. /// /// # Panics /// /// This panics when `replace_with.len()` does not equal /// [`AhoCorasick::patterns_len`]. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the default `Input` configuration. More specifically, this occurs only /// when the Aho-Corasick searcher does not support unanchored searches /// since this stream searching routine always does an unanchored search. /// /// This also returns an error if the searcher does not support stream /// searches. Only searchers built with [`MatchKind::Standard`] semantics /// support stream searches.
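/// /// # Example: using a buffered writer /// /// A minimal sketch of the buffered-writer suggestion above; the patterns and replacement strings here are arbitrary. Note that the search does not flush the writer, so we flush explicitly before inspecting the output. /// /// ``` /// use std::io::{BufWriter, Write}; /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&["tea"]).unwrap(); /// let mut out = vec![]; /// { /// let mut wtr = BufWriter::new(&mut out); /// ac.try_stream_replace_all( /// "tea time".as_bytes(), /// &mut wtr, /// &["coffee"], /// )?; /// // Flush the data buffered by BufWriter into `out`. /// wtr.flush()?; /// } /// assert_eq!(b"coffee time".to_vec(), out); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ```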
/// /// # Example: basic usage /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let patterns = &["fox", "brown", "quick"]; /// let haystack = "The quick brown fox."; /// let replace_with = &["sloth", "grey", "slow"]; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let mut result = vec![]; /// ac.try_stream_replace_all( /// haystack.as_bytes(), /// &mut result, /// replace_with, /// )?; /// assert_eq!(b"The slow grey sloth.".to_vec(), result); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[cfg(feature = "std")] pub fn try_stream_replace_all<R, W, B>( &self, rdr: R, wtr: W, replace_with: &[B], ) -> Result<(), std::io::Error> where R: std::io::Read, W: std::io::Write, B: AsRef<[u8]>, { enforce_anchored_consistency(self.start_kind, Anchored::No) .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; self.aut.try_stream_replace_all(rdr, wtr, replace_with) } /// Search the given reader and replace all matches of this automaton /// using the given closure. The result is written to the given /// writer. Matches correspond to the same matches as reported by /// [`AhoCorasick::try_find_iter`]. /// /// The closure accepts three parameters: the match found, the text of /// the match and the writer with which to write the replaced text (if any). /// /// After all matches are replaced, the writer is _not_ flushed. /// /// If there was a problem reading from the given reader or writing to the /// given writer, then the corresponding `io::Error` is returned and all /// replacement is stopped. /// /// When searching a stream, an internal buffer is used. Therefore, callers /// should avoid providing a buffered reader, if possible. However, /// callers may want to provide a buffered writer. /// /// Note that there is currently no infallible version of this routine. /// /// # Memory usage /// /// In general, searching streams will use a constant amount of memory for /// its internal buffer. The one requirement is that the internal buffer /// must be at least the size of the longest possible match. In most use /// cases, the default buffer size will be much larger than any individual /// match. /// /// # Errors /// /// This returns an error when this Aho-Corasick searcher does not support /// the default `Input` configuration. More specifically, this occurs only /// when the Aho-Corasick searcher does not support unanchored searches /// since this stream searching routine always does an unanchored search. /// /// This also returns an error if the searcher does not support stream /// searches. Only searchers built with [`MatchKind::Standard`] semantics /// support stream searches.
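/// /// # Example: unsupported match semantics /// /// A minimal sketch of the stream-search error described above: a searcher built with leftmost-first semantics cannot do stream searching, and the failure is surfaced as a `std::io::Error`. (The pattern here is arbitrary.) /// /// ``` /// use std::io::Write; /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(&["foo"]) /// .unwrap(); /// let mut wtr = vec![]; /// let result = ac.try_stream_replace_all_with( /// "foo".as_bytes(), /// &mut wtr, /// |_, _, wtr| wtr.write_all(b"x"), /// ); /// // Only MatchKind::Standard supports stream searching. /// assert!(result.is_err()); /// ```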
/// /// # Example: basic usage /// /// ``` /// use std::io::Write; /// use aho_corasick::AhoCorasick; /// /// let patterns = &["fox", "brown", "quick"]; /// let haystack = "The quick brown fox."; /// /// let ac = AhoCorasick::new(patterns).unwrap(); /// let mut result = vec![]; /// ac.try_stream_replace_all_with( /// haystack.as_bytes(), /// &mut result, /// |mat, _, wtr| { /// wtr.write_all(mat.pattern().as_usize().to_string().as_bytes()) /// }, /// )?; /// assert_eq!(b"The 2 1 0.".to_vec(), result); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[cfg(feature = "std")] pub fn try_stream_replace_all_with<R, W, F>( &self, rdr: R, wtr: W, replace_with: F, ) -> Result<(), std::io::Error> where R: std::io::Read, W: std::io::Write, F: FnMut(&Match, &[u8], &mut W) -> Result<(), std::io::Error>, { enforce_anchored_consistency(self.start_kind, Anchored::No) .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; self.aut.try_stream_replace_all_with(rdr, wtr, replace_with) } } /// Routines for querying information about the Aho-Corasick automaton. impl AhoCorasick { /// Returns the kind of the Aho-Corasick automaton used by this searcher. /// /// Knowing the Aho-Corasick kind is principally useful for diagnostic /// purposes. In particular, if no specific kind was given to /// [`AhoCorasickBuilder::kind`], then one is automatically chosen and /// this routine will report which one. /// /// Note that the heuristics used for choosing which `AhoCorasickKind` /// to use may be changed in a semver compatible release. /// /// # Examples /// /// ``` /// use aho_corasick::{AhoCorasick, AhoCorasickKind}; /// /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); /// // The specific Aho-Corasick kind chosen is not guaranteed! /// assert_eq!(AhoCorasickKind::DFA, ac.kind()); /// ``` pub fn kind(&self) -> AhoCorasickKind { self.kind } /// Returns the type of starting search configuration supported by this /// Aho-Corasick automaton. /// /// # Examples /// /// ``` /// use aho_corasick::{AhoCorasick, StartKind}; /// /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); /// assert_eq!(StartKind::Unanchored, ac.start_kind()); /// ``` pub fn start_kind(&self) -> StartKind { self.start_kind } /// Returns the match kind used by this automaton. /// /// The match kind is important because it determines what kinds of /// matches are returned. Also, some operations (such as overlapping /// search and stream searching) are only supported when using the /// [`MatchKind::Standard`] match kind. /// /// # Examples /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); /// assert_eq!(MatchKind::Standard, ac.match_kind()); /// ``` pub fn match_kind(&self) -> MatchKind { self.aut.match_kind() } /// Returns the length of the shortest pattern matched by this automaton. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); /// assert_eq!(3, ac.min_pattern_len()); /// ``` /// /// Note that an `AhoCorasick` automaton has a minimum length of `0` if /// and only if it can match the empty string: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&["foo", "", "quux", "baz"]).unwrap(); /// assert_eq!(0, ac.min_pattern_len()); /// ``` pub fn min_pattern_len(&self) -> usize { self.aut.min_pattern_len() } /// Returns the length of the longest pattern matched by this automaton.
/// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&["foo", "bar", "quux", "baz"]).unwrap(); /// assert_eq!(4, ac.max_pattern_len()); /// ``` pub fn max_pattern_len(&self) -> usize { self.aut.max_pattern_len() } /// Return the total number of patterns matched by this automaton. /// /// This includes patterns that may never participate in a match. For /// example, if [`MatchKind::LeftmostFirst`] match semantics are used, and /// the patterns `Sam` and `Samwise` were used to build the automaton (in /// that order), then `Samwise` can never participate in a match because /// `Sam` will always take priority. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&["foo", "bar", "baz"]).unwrap(); /// assert_eq!(3, ac.patterns_len()); /// ``` pub fn patterns_len(&self) -> usize { self.aut.patterns_len() } /// Returns the approximate total amount of heap used by this automaton, in /// units of bytes. /// /// # Examples /// /// This example shows the difference in heap usage between a few /// configurations: /// /// ``` /// # if !cfg!(target_pointer_width = "64") { return; } /// use aho_corasick::{AhoCorasick, AhoCorasickKind, MatchKind}; /// /// let ac = AhoCorasick::builder() /// .kind(None) // default /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) /// .unwrap(); /// assert_eq!(5_632, ac.memory_usage()); /// /// let ac = AhoCorasick::builder() /// .kind(None) // default /// .ascii_case_insensitive(true) /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) /// .unwrap(); /// assert_eq!(11_136, ac.memory_usage()); /// /// let ac = AhoCorasick::builder() /// .kind(Some(AhoCorasickKind::NoncontiguousNFA)) /// .ascii_case_insensitive(true) /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) /// .unwrap(); /// assert_eq!(10_879, ac.memory_usage()); /// /// let ac = AhoCorasick::builder() /// .kind(Some(AhoCorasickKind::ContiguousNFA)) /// .ascii_case_insensitive(true) /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) /// .unwrap(); /// assert_eq!(2_584, ac.memory_usage()); /// /// let ac = AhoCorasick::builder() /// .kind(Some(AhoCorasickKind::DFA)) /// .ascii_case_insensitive(true) /// .build(&["foobar", "bruce", "triskaidekaphobia", "springsteen"]) /// .unwrap(); /// // While this shows the DFA being the biggest here by a small margin, /// // don't let the difference fool you. With such a small number of /// // patterns, the difference is small, but a bigger number of patterns /// // will reveal that the rate of growth of the DFA is far bigger than /// // the NFAs above. For a large number of patterns, it is easy for the /// // DFA to take an order of magnitude more heap space (or more!). /// assert_eq!(11_136, ac.memory_usage()); /// ``` pub fn memory_usage(&self) -> usize { self.aut.memory_usage() } } // We provide a manual debug impl so that we don't include the 'start_kind', // principally because it's kind of weird to do so and because it screws with // the carefully curated debug output for the underlying automaton. impl core::fmt::Debug for AhoCorasick { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_tuple("AhoCorasick").field(&self.aut).finish() } } /// An iterator of non-overlapping matches in a particular haystack. /// /// This iterator yields matches according to the [`MatchKind`] used by this /// automaton. 
/// /// This iterator is constructed via the [`AhoCorasick::find_iter`] and /// [`AhoCorasick::try_find_iter`] methods. /// /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. /// /// The lifetime `'h` refers to the lifetime of the haystack being searched. #[derive(Debug)] pub struct FindIter<'a, 'h>(automaton::FindIter<'a, 'h, Arc<dyn AcAutomaton>>); impl<'a, 'h> Iterator for FindIter<'a, 'h> { type Item = Match; #[inline] fn next(&mut self) -> Option<Match> { self.0.next() } } /// An iterator of overlapping matches in a particular haystack. /// /// This iterator will report all possible matches in a particular haystack, /// even when the matches overlap. /// /// This iterator is constructed via the [`AhoCorasick::find_overlapping_iter`] /// and [`AhoCorasick::try_find_overlapping_iter`] methods. /// /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. /// /// The lifetime `'h` refers to the lifetime of the haystack being searched. #[derive(Debug)] pub struct FindOverlappingIter<'a, 'h>( automaton::FindOverlappingIter<'a, 'h, Arc<dyn AcAutomaton>>, ); impl<'a, 'h> Iterator for FindOverlappingIter<'a, 'h> { type Item = Match; #[inline] fn next(&mut self) -> Option<Match> { self.0.next() } } /// An iterator that reports Aho-Corasick matches in a stream. /// /// This iterator yields elements of type `Result<Match, std::io::Error>`, /// where an error is reported if there was a problem reading from the /// underlying stream. The iterator terminates only when the underlying stream /// reaches `EOF`. /// /// This iterator is constructed via the [`AhoCorasick::stream_find_iter`] and /// [`AhoCorasick::try_stream_find_iter`] methods. /// /// The type variable `R` refers to the `io::Read` stream that is being read /// from. /// /// The lifetime `'a` refers to the lifetime of the corresponding /// [`AhoCorasick`] searcher. #[cfg(feature = "std")] #[derive(Debug)] pub struct StreamFindIter<'a, R>( automaton::StreamFindIter<'a, Arc<dyn AcAutomaton>, R>, ); #[cfg(feature = "std")] impl<'a, R: std::io::Read> Iterator for StreamFindIter<'a, R> { type Item = Result<Match, std::io::Error>; fn next(&mut self) -> Option<Result<Match, std::io::Error>> { self.0.next() } } /// A builder for configuring an Aho-Corasick automaton. /// /// # Quick advice /// /// * Use [`AhoCorasickBuilder::match_kind`] to configure your searcher /// with [`MatchKind::LeftmostFirst`] if you want to match how backtracking /// regex engines execute searches for `pat1|pat2|..|patN`. Use /// [`MatchKind::LeftmostLongest`] if you want to match how POSIX regex engines /// do it. /// * If you need an anchored search, use [`AhoCorasickBuilder::start_kind`] to /// set the [`StartKind::Anchored`] mode since [`StartKind::Unanchored`] is the /// default. Or just use [`StartKind::Both`] to support both types of searches. /// * You might want to use [`AhoCorasickBuilder::kind`] to set your searcher /// to always use a [`AhoCorasickKind::DFA`] if search speed is critical and /// memory usage isn't a concern. Otherwise, not setting a kind will probably /// make the right choice for you. Beware that if you use [`StartKind::Both`] /// to build a searcher that supports both unanchored and anchored searches /// _and_ you set [`AhoCorasickKind::DFA`], then the DFA will essentially be /// duplicated to support both simultaneously. This results in very high memory /// usage. /// * For all other options, their defaults are almost certainly what you want. /// A short sketch applying this advice follows the list.
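/// /// For illustration, a minimal sketch that applies the advice above (the pattern set is arbitrary, and `kind` is deliberately left unset so the implementation is chosen automatically): /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind, StartKind}; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .start_kind(StartKind::Both) /// .build(&["sam", "samwise"]) /// .unwrap(); /// assert_eq!(StartKind::Both, ac.start_kind()); /// assert_eq!(MatchKind::LeftmostFirst, ac.match_kind()); /// ```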
#[derive(Clone, Debug, Default)] pub struct AhoCorasickBuilder { nfa_noncontiguous: noncontiguous::Builder, nfa_contiguous: contiguous::Builder, dfa: dfa::Builder, kind: Option<AhoCorasickKind>, start_kind: StartKind, } impl AhoCorasickBuilder { /// Create a new builder for configuring an Aho-Corasick automaton. /// /// The builder provides a way to configure a number of things, including /// ASCII case insensitivity and what kind of match semantics are used. pub fn new() -> AhoCorasickBuilder { AhoCorasickBuilder::default() } /// Build an Aho-Corasick automaton using the configuration set on this /// builder. /// /// A builder may be reused to create more automatons. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, PatternID}; /// /// let patterns = &["foo", "bar", "baz"]; /// let ac = AhoCorasickBuilder::new().build(patterns).unwrap(); /// assert_eq!( /// Some(PatternID::must(1)), /// ac.find("xxx bar xxx").map(|m| m.pattern()), /// ); /// ``` pub fn build<I, P>(&self, patterns: I) -> Result<AhoCorasick, BuildError> where I: IntoIterator<Item = P>, P: AsRef<[u8]>, { let nfa = self.nfa_noncontiguous.build(patterns)?; let (aut, kind): (Arc<dyn AcAutomaton>, AhoCorasickKind) = match self.kind { None => { debug!( "asked for automatic Aho-Corasick implementation, \ criteria: <patterns: {:?}, max pattern len: {:?}, \ start kind: {:?}>", nfa.patterns_len(), nfa.max_pattern_len(), self.start_kind, ); self.build_auto(nfa) } Some(AhoCorasickKind::NoncontiguousNFA) => { debug!("forcefully chose noncontiguous NFA"); (Arc::new(nfa), AhoCorasickKind::NoncontiguousNFA) } Some(AhoCorasickKind::ContiguousNFA) => { debug!("forcefully chose contiguous NFA"); let cnfa = self.nfa_contiguous.build_from_noncontiguous(&nfa)?; (Arc::new(cnfa), AhoCorasickKind::ContiguousNFA) } Some(AhoCorasickKind::DFA) => { debug!("forcefully chose DFA"); let dfa = self.dfa.build_from_noncontiguous(&nfa)?; (Arc::new(dfa), AhoCorasickKind::DFA) } }; Ok(AhoCorasick { aut, kind, start_kind: self.start_kind }) } /// Implements the automatic selection logic for the Aho-Corasick /// implementation to use. Since all Aho-Corasick automatons are built /// from a non-contiguous NFA, the caller is responsible for building /// that first. fn build_auto( &self, nfa: noncontiguous::NFA, ) -> (Arc<dyn AcAutomaton>, AhoCorasickKind) { // We try to build a DFA if we have a very small number of patterns, // otherwise the memory usage just gets too crazy. We also only do it // when the start kind is unanchored or anchored, but not both, because // both implies two full copies of the transition table. let try_dfa = !matches!(self.start_kind, StartKind::Both) && nfa.patterns_len() <= 100; if try_dfa { match self.dfa.build_from_noncontiguous(&nfa) { Ok(dfa) => { debug!("chose a DFA"); return (Arc::new(dfa), AhoCorasickKind::DFA); } Err(_err) => { debug!( "failed to build DFA, trying something else: {}", _err ); } } } // We basically always want a contiguous NFA if the limited // circumstances in which we use a DFA are not true. It is quite fast // and has excellent memory usage. The only way we don't use it is if // there are so many states that it can't fit in a contiguous NFA. // And the only way to know that is to try to build it. Building a // contiguous NFA is mostly just reshuffling data from a noncontiguous // NFA, so it isn't too expensive, especially relative to building a // noncontiguous NFA in the first place.
match self.nfa_contiguous.build_from_noncontiguous(&nfa) { Ok(nfa) => { debug!("chose contiguous NFA"); return (Arc::new(nfa), AhoCorasickKind::ContiguousNFA); } #[allow(unused_variables)] // unused when 'logging' is disabled Err(_err) => { debug!( "failed to build contiguous NFA, \ trying something else: {}", _err ); } } debug!("chose non-contiguous NFA"); (Arc::new(nfa), AhoCorasickKind::NoncontiguousNFA) } /// Set the desired match semantics. /// /// The default is [`MatchKind::Standard`], which corresponds to the match /// semantics supported by the standard textbook description of the /// Aho-Corasick algorithm. Namely, matches are reported as soon as they /// are found. Moreover, this is the only way to get overlapping matches /// or do stream searching. /// /// The other kinds of match semantics that are supported are /// [`MatchKind::LeftmostFirst`] and [`MatchKind::LeftmostLongest`]. The /// former corresponds to the match you would get if you were to try to /// match each pattern at each position in the haystack in the same order /// that you give to the automaton. That is, it returns the leftmost match /// corresponding to the earliest pattern given to the automaton. The /// latter corresponds to finding the longest possible match among all /// leftmost matches. /// /// For more details on match semantics, see the [documentation for /// `MatchKind`](MatchKind). /// /// Note that setting this to [`MatchKind::LeftmostFirst`] or /// [`MatchKind::LeftmostLongest`] will cause some search routines on /// [`AhoCorasick`] to return an error (or panic if you're using the /// infallible API). Notably, this includes stream and overlapping /// searches. /// /// # Examples /// /// In these examples, we demonstrate the differences between match /// semantics for a particular set of patterns in a specific order: /// `b`, `abc`, `abcd`. /// /// Standard semantics: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::Standard) // default, not necessary /// .build(patterns) /// .unwrap(); /// let mat = ac.find(haystack).expect("should have a match"); /// assert_eq!("b", &haystack[mat.start()..mat.end()]); /// ``` /// /// Leftmost-first semantics: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let mat = ac.find(haystack).expect("should have a match"); /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); /// ``` /// /// Leftmost-longest semantics: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostLongest) /// .build(patterns) /// .unwrap(); /// let mat = ac.find(haystack).expect("should have a match"); /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); /// ``` pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder { self.nfa_noncontiguous.match_kind(kind); self.nfa_contiguous.match_kind(kind); self.dfa.match_kind(kind); self } /// Sets the starting state configuration for the automaton. /// /// Every Aho-Corasick automaton is capable of having two start states: one /// that is used for unanchored searches and one that is used for anchored /// searches. 
Some automatons, like the NFAs, support this with almost zero
    /// additional cost. Other automatons, like the DFA, require two copies of
    /// the underlying transition table to support both simultaneously.
    ///
    /// Because there may be an added non-trivial cost to supporting both, it
    /// is possible to configure which starting state configuration is needed.
    ///
    /// Indeed, since anchored searches tend to be somewhat more rare,
    /// _only_ unanchored searches are supported by default. Thus,
    /// [`StartKind::Unanchored`] is the default.
    ///
    /// Note that when this is set to [`StartKind::Unanchored`], then
    /// running an anchored search will result in an error (or a panic
    /// if using the infallible APIs). Similarly, when this is set to
    /// [`StartKind::Anchored`], then running an unanchored search will
    /// result in an error (or a panic if using the infallible APIs). When
    /// [`StartKind::Both`] is used, then both unanchored and anchored searches
    /// are always supported.
    ///
    /// Also note that even if an `AhoCorasick` searcher is using an NFA
    /// internally (which always supports both unanchored and anchored
    /// searches), an error will still be reported for a search that isn't
    /// supported by the configuration set via this method. This means,
    /// for example, that an error is never dependent on which internal
    /// implementation of Aho-Corasick is used.
    ///
    /// # Example: anchored search
    ///
    /// This shows how to build a searcher that only supports anchored
    /// searches:
    ///
    /// ```
    /// use aho_corasick::{
    ///     AhoCorasick, Anchored, Input, Match, MatchKind, StartKind,
    /// };
    ///
    /// let ac = AhoCorasick::builder()
    ///     .match_kind(MatchKind::LeftmostFirst)
    ///     .start_kind(StartKind::Anchored)
    ///     .build(&["b", "abc", "abcd"])
    ///     .unwrap();
    ///
    /// // An unanchored search is not supported! An error here is guaranteed
    /// // given the configuration above regardless of which kind of
    /// // Aho-Corasick implementation ends up being used internally.
    /// let input = Input::new("foo abcd").anchored(Anchored::No);
    /// assert!(ac.try_find(input).is_err());
    ///
    /// let input = Input::new("foo abcd").anchored(Anchored::Yes);
    /// assert_eq!(None, ac.try_find(input)?);
    ///
    /// let input = Input::new("abcd").anchored(Anchored::Yes);
    /// assert_eq!(Some(Match::must(1, 0..3)), ac.try_find(input)?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: unanchored and anchored searches
    ///
    /// This shows how to build a searcher that supports both unanchored and
    /// anchored searches:
    ///
    /// ```
    /// use aho_corasick::{
    ///     AhoCorasick, Anchored, Input, Match, MatchKind, StartKind,
    /// };
    ///
    /// let ac = AhoCorasick::builder()
    ///     .match_kind(MatchKind::LeftmostFirst)
    ///     .start_kind(StartKind::Both)
    ///     .build(&["b", "abc", "abcd"])
    ///     .unwrap();
    ///
    /// let input = Input::new("foo abcd").anchored(Anchored::No);
    /// assert_eq!(Some(Match::must(1, 4..7)), ac.try_find(input)?);
    ///
    /// let input = Input::new("foo abcd").anchored(Anchored::Yes);
    /// assert_eq!(None, ac.try_find(input)?);
    ///
    /// let input = Input::new("abcd").anchored(Anchored::Yes);
    /// assert_eq!(Some(Match::must(1, 0..3)), ac.try_find(input)?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn start_kind(&mut self, kind: StartKind) -> &mut AhoCorasickBuilder {
        self.dfa.start_kind(kind);
        self.start_kind = kind;
        self
    }

    /// Enable ASCII-aware case insensitive matching.
    ///
    /// When this option is enabled, searching will be performed without
    /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
    ///
    /// Enabling this option does not change the search algorithm, but it may
    /// increase the size of the automaton.
    ///
    /// **NOTE:** It is unlikely that support for Unicode case folding will
    /// be added in the future. The ASCII case works via a simple hack to the
    /// underlying automaton, but full Unicode handling requires a fair bit of
    /// sophistication. If you do need Unicode handling, you might consider
    /// using the [`regex` crate](https://docs.rs/regex) or the lower level
    /// [`regex-automata` crate](https://docs.rs/regex-automata).
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::AhoCorasick;
    ///
    /// let patterns = &["FOO", "bAr", "BaZ"];
    /// let haystack = "foo bar baz";
    ///
    /// let ac = AhoCorasick::builder()
    ///     .ascii_case_insensitive(true)
    ///     .build(patterns)
    ///     .unwrap();
    /// assert_eq!(3, ac.find_iter(haystack).count());
    /// ```
    pub fn ascii_case_insensitive(
        &mut self,
        yes: bool,
    ) -> &mut AhoCorasickBuilder {
        self.nfa_noncontiguous.ascii_case_insensitive(yes);
        self.nfa_contiguous.ascii_case_insensitive(yes);
        self.dfa.ascii_case_insensitive(yes);
        self
    }

    /// Choose the type of underlying automaton to use.
    ///
    /// Currently, there are four choices:
    ///
    /// * [`AhoCorasickKind::NoncontiguousNFA`] instructs the searcher to
    /// use a [`noncontiguous::NFA`]. A noncontiguous NFA is the fastest to
    /// be built, has moderate memory usage and is typically the slowest to
    /// execute a search.
    /// * [`AhoCorasickKind::ContiguousNFA`] instructs the searcher to use a
    /// [`contiguous::NFA`]. A contiguous NFA is a little slower to build than
    /// a noncontiguous NFA, has excellent memory usage and is typically a
    /// little slower than a DFA for a search.
    /// * [`AhoCorasickKind::DFA`] instructs the searcher to use a
    /// [`dfa::DFA`]. A DFA is very slow to build, uses exorbitant amounts of
    /// memory, but will typically execute searches the fastest.
    /// * `None` (the default) instructs the searcher to choose the "best"
    /// Aho-Corasick implementation. This choice is typically based primarily
    /// on the number of patterns.
    ///
    /// Setting this configuration does not change the time complexity for
    /// constructing the Aho-Corasick automaton (which is `O(p)` where `p`
    /// is the total number of patterns being compiled). Setting this to
    /// [`AhoCorasickKind::DFA`] does however reduce the time complexity of
    /// non-overlapping searches from `O(n + p)` to `O(n)`, where `n` is the
    /// length of the haystack.
    ///
    /// In general, you should probably stick to the default unless you have
    /// some kind of reason to use a specific Aho-Corasick implementation. For
    /// example, you might choose `AhoCorasickKind::DFA` if you don't care
    /// about memory usage and want the fastest possible search times.
    ///
    /// Setting this guarantees that the searcher returned uses the chosen
    /// implementation. If that implementation could not be constructed, then
    /// an error will be returned. In contrast, when `None` is used, it is
    /// possible for it to attempt to construct, for example, a contiguous
    /// NFA and have it fail. In which case, it will fall back to using a
    /// noncontiguous NFA.
    ///
    /// If `None` is given, then one may use [`AhoCorasick::kind`] to determine
    /// which Aho-Corasick implementation was chosen.
    ///
    /// Note that the heuristics used for choosing which `AhoCorasickKind`
    /// to use may be changed in a semver compatible release.
    pub fn kind(
        &mut self,
        kind: Option<AhoCorasickKind>,
    ) -> &mut AhoCorasickBuilder {
        self.kind = kind;
        self
    }

    /// Enable heuristic prefilter optimizations.
    ///
    /// When enabled, searching will attempt to quickly skip to match
    /// candidates using specialized literal search routines. A prefilter
    /// cannot always be used, and is generally treated as a heuristic. It
    /// can be useful to disable this if the prefilter is observed to be
    /// sub-optimal for a particular workload.
    ///
    /// Currently, prefilters are typically only active when building searchers
    /// with a small (less than 100) number of patterns.
    ///
    /// This is enabled by default.
    pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
        self.nfa_noncontiguous.prefilter(yes);
        self.nfa_contiguous.prefilter(yes);
        self.dfa.prefilter(yes);
        self
    }

    /// Set the limit on how many states use a dense representation for their
    /// transitions. Other states will generally use a sparse representation.
    ///
    /// A dense representation uses more memory but is generally faster, since
    /// the next transition in a dense representation can be computed in a
    /// constant number of instructions. A sparse representation uses less
    /// memory but is generally slower, since the next transition in a sparse
    /// representation requires executing a variable number of instructions.
    ///
    /// This setting is only used when an Aho-Corasick implementation is used
    /// that supports the dense versus sparse representation trade off. Not all
    /// do.
    ///
    /// This limit is expressed in terms of the depth of a state, i.e., the
    /// number of transitions from the starting state of the automaton. The
    /// idea is that most of the time searching will be spent near the starting
    /// state of the automaton, so states near the start state should use a
    /// dense representation. States further away from the start state would
    /// then use a sparse representation.
    ///
    /// By default, this is set to a low but non-zero number. Setting this to
    /// `0` is almost never what you want, since it is likely to make searches
    /// very slow due to the start state itself being forced to use a sparse
    /// representation. However, it is unlikely that increasing this number
    /// will help things much, since the most active states have a small depth.
    /// More to the point, the memory usage increases superlinearly as this
    /// number increases.
    pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
        self.nfa_noncontiguous.dense_depth(depth);
        self.nfa_contiguous.dense_depth(depth);
        self
    }

    /// A debug setting for whether to attempt to shrink the size of the
    /// automaton's alphabet or not.
    ///
    /// This option is enabled by default and should never be disabled unless
    /// one is debugging the underlying automaton.
    ///
    /// When enabled, some (but not all) Aho-Corasick automatons will use a map
    /// from all possible bytes to their corresponding equivalence class. Each
    /// equivalence class represents a set of bytes that does not discriminate
    /// between a match and a non-match in the automaton.
    ///
    /// The advantage of this map is that the size of the transition table can
    /// be reduced drastically from `#states * 256 * sizeof(u32)` to
    /// `#states * k * sizeof(u32)` where `k` is the number of equivalence
    /// classes (rounded up to the nearest power of 2). As a result, total
    /// space usage can decrease substantially. Moreover, since a smaller
    /// alphabet is used, automaton compilation becomes faster as well.
    ///
    /// **WARNING:** This is only useful for debugging automatons. Disabling
    /// this does not yield any speed advantages. Namely, even when this is
    /// disabled, a byte class map is still used while searching. The only
    /// difference is that every byte will be forced into its own distinct
    /// equivalence class. This is useful for debugging the actual generated
    /// transitions because it lets one see the transitions defined on actual
    /// bytes instead of the equivalence classes.
    pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
        self.nfa_contiguous.byte_classes(yes);
        self.dfa.byte_classes(yes);
        self
    }
}

/// The type of Aho-Corasick implementation to use in an [`AhoCorasick`]
/// searcher.
///
/// This is principally used as an input to the [`AhoCorasickBuilder::kind`]
/// method. Its documentation goes into more detail about each choice.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum AhoCorasickKind {
    /// Use a noncontiguous NFA.
    NoncontiguousNFA,
    /// Use a contiguous NFA.
    ContiguousNFA,
    /// Use a DFA. Warning: DFAs typically use a large amount of memory.
    DFA,
}

/// A trait that effectively gives us practical dynamic dispatch over anything
/// that impls `Automaton`, but without needing to add a bunch of bounds to
/// the core `Automaton` trait. Basically, we provide all of the marker traits
/// that our automatons have, in addition to `Debug` impls and requiring that
/// there is no borrowed data. Without these, the main `AhoCorasick` type would
/// not be able to meaningfully impl `Debug` or the marker traits without also
/// requiring that all impls of `Automaton` do so, which would be not great.
trait AcAutomaton:
    Automaton + Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
{
}

impl<A> AcAutomaton for A where
    A: Automaton + Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static
{
}

impl crate::automaton::private::Sealed for Arc<dyn AcAutomaton> {}

// I'm not sure why this trait impl shows up in the docs, as the AcAutomaton
// trait is not exported. So we forcefully hide it.
//
// SAFETY: This just defers to the underlying 'AcAutomaton' and thus inherits
// its safety properties.
#[doc(hidden)]
unsafe impl Automaton for Arc<dyn AcAutomaton> {
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        (**self).start_state(anchored)
    }

    #[inline(always)]
    fn next_state(
        &self,
        anchored: Anchored,
        sid: StateID,
        byte: u8,
    ) -> StateID {
        (**self).next_state(anchored, sid, byte)
    }

    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        (**self).is_special(sid)
    }

    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        (**self).is_dead(sid)
    }

    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        (**self).is_match(sid)
    }

    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        (**self).is_start(sid)
    }

    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        (**self).match_kind()
    }

    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        (**self).match_len(sid)
    }

    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        (**self).match_pattern(sid, index)
    }

    #[inline(always)]
    fn patterns_len(&self) -> usize {
        (**self).patterns_len()
    }

    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        (**self).pattern_len(pid)
    }

    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        (**self).min_pattern_len()
    }

    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        (**self).max_pattern_len()
    }

    #[inline(always)]
    fn memory_usage(&self) -> usize {
        (**self).memory_usage()
    }

    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        (**self).prefilter()
    }

    // Even though 'try_find' and 'try_find_overlapping' each have their
    // own default impls, we explicitly define them here to fix a perf bug.
    // Without these explicit definitions, the default impl will wind up using
    // dynamic dispatch for all 'Automaton' method calls, including things like
    // 'next_state' that absolutely must get inlined or else perf is trashed.
    // Defining them explicitly here like this still requires dynamic dispatch
    // to call 'try_find' itself, but all uses of 'Automaton' within 'try_find'
    // are monomorphized.
    //
    // We don't need to explicitly impl any other methods, I think, because
    // they are all implemented themselves in terms of 'try_find' and
    // 'try_find_overlapping'. We still might wind up with an extra virtual
    // call here or there, but that's okay since it's outside of any perf
    // critical areas.

    #[inline(always)]
    fn try_find(
        &self,
        input: &Input<'_>,
    ) -> Result<Option<Match>, MatchError> {
        (**self).try_find(input)
    }

    #[inline(always)]
    fn try_find_overlapping(
        &self,
        input: &Input<'_>,
        state: &mut OverlappingState,
    ) -> Result<(), MatchError> {
        (**self).try_find_overlapping(input, state)
    }
}

/// Returns an error if the start state configuration does not support the
/// desired search configuration. See the internal 'AhoCorasick::start_kind'
/// field docs for more details.
fn enforce_anchored_consistency(
    have: StartKind,
    want: Anchored,
) -> Result<(), MatchError> {
    match have {
        StartKind::Both => Ok(()),
        StartKind::Unanchored if !want.is_anchored() => Ok(()),
        StartKind::Unanchored => Err(MatchError::invalid_input_anchored()),
        StartKind::Anchored if want.is_anchored() => Ok(()),
        StartKind::Anchored => Err(MatchError::invalid_input_unanchored()),
    }
}
aho-corasick-1.1.3/src/automaton.rs000064400000000000000000001761571046102023000153510ustar 00000000000000/*!
Provides the [`Automaton`] trait for abstracting over Aho-Corasick automata.

The `Automaton` trait provides a way to write generic code over any
Aho-Corasick automaton. It also provides access to lower level APIs that
permit walking the state transitions of an Aho-Corasick automaton manually.
*/

use alloc::{string::String, vec::Vec};

use crate::util::{
    error::MatchError,
    primitives::PatternID,
    search::{Anchored, Input, Match, MatchKind, Span},
};

pub use crate::util::{
    prefilter::{Candidate, Prefilter},
    primitives::{StateID, StateIDError},
};

/// We seal the `Automaton` trait for now. It's a big trait, and it's
/// conceivable that I might want to add new required methods, and sealing the
/// trait permits doing that in a backwards compatible fashion. On the other
/// hand, if you have a solid use case for implementing the trait yourself,
/// please file an issue and we can discuss it. This was *mostly* done as a
/// conservative step.
pub(crate) mod private {
    pub trait Sealed {}
}
impl private::Sealed for crate::nfa::noncontiguous::NFA {}
impl private::Sealed for crate::nfa::contiguous::NFA {}
impl private::Sealed for crate::dfa::DFA {}

impl<'a, T: private::Sealed + ?Sized> private::Sealed for &'a T {}

/// A trait that abstracts over Aho-Corasick automata.
///
/// This trait primarily exists for niche use cases such as:
///
/// * Using an NFA or DFA directly, bypassing the top-level
/// [`AhoCorasick`](crate::AhoCorasick) searcher. Currently, these include
/// [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA),
/// [`contiguous::NFA`](crate::nfa::contiguous::NFA) and
/// [`dfa::DFA`](crate::dfa::DFA).
/// * Implementing your own custom search routine by walking the automaton
/// yourself. This might be useful for implementing search on non-contiguous
/// strings or streams.
/// /// For most use cases, it is not expected that users will need /// to use or even know about this trait. Indeed, the top level /// [`AhoCorasick`](crate::AhoCorasick) searcher does not expose any details /// about this trait, nor does it implement it itself. /// /// Note that this trait defines a number of default methods, such as /// [`Automaton::try_find`] and [`Automaton::try_find_iter`], which implement /// higher level search routines in terms of the lower level automata API. /// /// # Sealed /// /// Currently, this trait is sealed. That means users of this crate can write /// generic routines over this trait but cannot implement it themselves. This /// restriction may be lifted in the future, but sealing the trait permits /// adding new required methods in a backwards compatible fashion. /// /// # Special states /// /// This trait encodes a notion of "special" states in an automaton. Namely, /// a state is treated as special if it is a dead, match or start state: /// /// * A dead state is a state that cannot be left once entered. All transitions /// on a dead state lead back to itself. The dead state is meant to be treated /// as a sentinel indicating that the search should stop and return a match if /// one has been found, and nothing otherwise. /// * A match state is a state that indicates one or more patterns have /// matched. Depending on the [`MatchKind`] of the automaton, a search may /// stop once a match is seen, or it may continue looking for matches until /// it enters a dead state or sees the end of the haystack. /// * A start state is a state that a search begins in. It is useful to know /// when a search enters a start state because it may mean that a prefilter can /// be used to skip ahead and quickly look for candidate matches. Unlike dead /// and match states, it is never necessary to explicitly handle start states /// for correctness. Indeed, in this crate, implementations of `Automaton` /// will only treat start states as "special" when a prefilter is enabled and /// active. Otherwise, treating it as special has no purpose and winds up /// slowing down the overall search because it results in ping-ponging between /// the main state transition and the "special" state logic. /// /// Since checking whether a state is special by doing three different /// checks would be too expensive inside a fast search loop, the /// [`Automaton::is_special`] method is provided for quickly checking whether /// the state is special. The `Automaton::is_dead`, `Automaton::is_match` and /// `Automaton::is_start` predicates can then be used to determine which kind /// of special state it is. /// /// # Panics /// /// Most of the APIs on this trait should panic or give incorrect results /// if invalid inputs are given to it. For example, `Automaton::next_state` /// has unspecified behavior if the state ID given to it is not a valid /// state ID for the underlying automaton. Valid state IDs can only be /// retrieved in one of two ways: calling `Automaton::start_state` or calling /// `Automaton::next_state` with a valid state ID. /// /// # Safety /// /// This trait is not safe to implement so that code may rely on the /// correctness of implementations of this trait to avoid undefined behavior. /// The primary correctness guarantees are: /// /// * `Automaton::start_state` always returns a valid state ID or an error or /// panics. /// * `Automaton::next_state`, when given a valid state ID, always returns /// a valid state ID for all values of `anchored` and `byte`, or otherwise /// panics. 
///
/// In general, the rest of the methods on `Automaton` need to uphold their
/// contracts as well. For example, `Automaton::is_dead` should only return
/// true if the given state ID is actually a dead state.
///
/// Note that currently this crate does not rely on the safety property defined
/// here to avoid undefined behavior. Instead, this was done to make it
/// _possible_ to do in the future.
///
/// # Example
///
/// This example shows how one might implement a basic but correct search
/// routine. We keep things simple by not using prefilters or worrying about
/// anchored searches, but do make sure our search is correct for all possible
/// [`MatchKind`] semantics. (The comments in the code below note the parts
/// that are needed to support certain `MatchKind` semantics.)
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Anchored, Match, MatchError, MatchKind,
/// };
///
/// // Run an unanchored search for 'aut' in 'haystack'. Return the first match
/// // seen according to the automaton's match semantics. This returns an error
/// // if the given automaton does not support unanchored searches.
/// fn find<A: Automaton>(
///     aut: A,
///     haystack: &[u8],
/// ) -> Result<Option<Match>, MatchError> {
///     let mut sid = aut.start_state(Anchored::No)?;
///     let mut at = 0;
///     let mut mat = None;
///     let get_match = |sid, at| {
///         let pid = aut.match_pattern(sid, 0);
///         let len = aut.pattern_len(pid);
///         Match::new(pid, (at - len)..at)
///     };
///     // Start states can be match states!
///     if aut.is_match(sid) {
///         mat = Some(get_match(sid, at));
///         // Standard semantics require matches to be reported as soon as
///         // they're seen. Otherwise, we continue until we see a dead state
///         // or the end of the haystack.
///         if matches!(aut.match_kind(), MatchKind::Standard) {
///             return Ok(mat);
///         }
///     }
///     while at < haystack.len() {
///         sid = aut.next_state(Anchored::No, sid, haystack[at]);
///         if aut.is_special(sid) {
///             if aut.is_dead(sid) {
///                 return Ok(mat);
///             } else if aut.is_match(sid) {
///                 mat = Some(get_match(sid, at + 1));
///                 // As above, standard semantics require that we return
///                 // immediately once a match is found.
///                 if matches!(aut.match_kind(), MatchKind::Standard) {
///                     return Ok(mat);
///                 }
///             }
///         }
///         at += 1;
///     }
///     Ok(mat)
/// }
///
/// // Show that it works for standard searches.
/// let nfa = NFA::new(&["samwise", "sam"]).unwrap();
/// assert_eq!(Some(Match::must(1, 0..3)), find(&nfa, b"samwise")?);
///
/// // But also works when using leftmost-first. Notice how the match result
/// // has changed!
/// let nfa = NFA::builder()
///     .match_kind(MatchKind::LeftmostFirst)
///     .build(&["samwise", "sam"])
///     .unwrap();
/// assert_eq!(Some(Match::must(0, 0..7)), find(&nfa, b"samwise")?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub unsafe trait Automaton: private::Sealed {
    /// Returns the starting state for the given anchor mode.
    ///
    /// Upon success, the state ID returned is guaranteed to be valid for
    /// this automaton.
    ///
    /// # Errors
    ///
    /// This returns an error when the given search configuration is not
    /// supported by the underlying automaton. For example, if the underlying
    /// automaton only supports unanchored searches but the given configuration
    /// was set to an anchored search, then this must return an error.
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError>;

    /// Performs a state transition from `sid` for `byte` and returns the next
    /// state.
    ///
    /// `anchored` should be [`Anchored::Yes`] when executing an anchored
    /// search and [`Anchored::No`] otherwise. For some implementations of
    /// `Automaton`, it is required to know whether the search is anchored
    /// or not in order to avoid following failure transitions. Other
    /// implementations may ignore `anchored` altogether and depend on
    /// `Automaton::start_state` returning a state that walks a different path
    /// through the automaton depending on whether the search is anchored or
    /// not.
    ///
    /// # Panics
    ///
    /// This routine may panic or return incorrect results when the given state
    /// ID is invalid. A state ID is valid if and only if:
    ///
    /// 1. It came from a call to `Automaton::start_state`, or
    /// 2. It came from a previous call to `Automaton::next_state` with a
    /// valid state ID.
    ///
    /// Implementations must treat all possible values of `byte` as valid.
    ///
    /// Implementations may panic on unsupported values of `anchored`, but are
    /// not required to do so.
    fn next_state(
        &self,
        anchored: Anchored,
        sid: StateID,
        byte: u8,
    ) -> StateID;

    /// Returns true if the given ID represents a "special" state. A special
    /// state is a dead, match or start state.
    ///
    /// Note that implementations may choose to return false when the given ID
    /// corresponds to a start state. Namely, it is always correct to treat
    /// start states as non-special. Implementations must return true for
    /// states that are dead or contain matches.
    ///
    /// This has unspecified behavior when given an invalid state ID.
    fn is_special(&self, sid: StateID) -> bool;

    /// Returns true if the given ID represents a dead state.
    ///
    /// A dead state is a type of "sink" in a finite state machine. It
    /// corresponds to a state whose transitions all loop back to itself. That
    /// is, once entered, it can never be left. In practice, it serves as a
    /// sentinel indicating that the search should terminate.
    ///
    /// This has unspecified behavior when given an invalid state ID.
    fn is_dead(&self, sid: StateID) -> bool;

    /// Returns true if the given ID represents a match state.
    ///
    /// A match state is always associated with one or more pattern IDs that
    /// matched at the position in the haystack when the match state was
    /// entered. When a match state is entered, the match semantics dictate
    /// whether it should be returned immediately (for `MatchKind::Standard`)
    /// or if the search should continue (for `MatchKind::LeftmostFirst` and
    /// `MatchKind::LeftmostLongest`) until a dead state is seen or the end of
    /// the haystack has been reached.
    ///
    /// This has unspecified behavior when given an invalid state ID.
    fn is_match(&self, sid: StateID) -> bool;

    /// Returns true if the given ID represents a start state.
    ///
    /// While it is never incorrect to ignore start states during a search
    /// (except for the start of the search of course), knowing whether one has
    /// entered a start state can be useful for certain classes of performance
    /// optimizations. For example, if one is in a start state, it may be legal
    /// to try to skip ahead and look for match candidates more quickly than
    /// would otherwise be accomplished by walking the automaton.
    ///
    /// Implementations of `Automaton` in this crate "unspecialize" start
    /// states when a prefilter is not active or enabled. In this case, it
    /// is possible for `Automaton::is_special(sid)` to return false while
    /// `Automaton::is_start(sid)` returns true.
    ///
    /// This has unspecified behavior when given an invalid state ID.
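    ///
    /// For example, a brief sketch using one of this crate's public
    /// automatons (the pattern is illustrative):
    ///
    /// ```
    /// use aho_corasick::{automaton::Automaton, nfa::noncontiguous::NFA, Anchored};
    ///
    /// let nfa = NFA::new(&["samwise"]).unwrap();
    /// let sid = nfa.start_state(Anchored::No)?;
    /// assert!(nfa.is_start(sid));
    /// # Ok::<(), aho_corasick::MatchError>(())
    /// ```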
    fn is_start(&self, sid: StateID) -> bool;

    /// Returns the match semantics that this automaton was built with.
    fn match_kind(&self) -> MatchKind;

    /// Returns the total number of matches for the given state ID.
    ///
    /// This has unspecified behavior if the given ID does not refer to a match
    /// state.
    fn match_len(&self, sid: StateID) -> usize;

    /// Returns the pattern ID for the match state given by `sid` at the
    /// `index` given.
    ///
    /// Typically, `index` is only ever greater than `0` when implementing an
    /// overlapping search. Otherwise, it's likely that your search only cares
    /// about reporting the first pattern ID in a match state.
    ///
    /// This has unspecified behavior if the given ID does not refer to a match
    /// state, or if the index is greater than or equal to the total number of
    /// matches in this match state.
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID;

    /// Returns the total number of patterns compiled into this automaton.
    fn patterns_len(&self) -> usize;

    /// Returns the length of the pattern for the given ID.
    ///
    /// This has unspecified behavior when given an invalid pattern
    /// ID. A pattern ID is valid if and only if it is less than
    /// `Automaton::patterns_len`.
    fn pattern_len(&self, pid: PatternID) -> usize;

    /// Returns the length, in bytes, of the shortest pattern in this
    /// automaton.
    fn min_pattern_len(&self) -> usize;

    /// Returns the length, in bytes, of the longest pattern in this automaton.
    fn max_pattern_len(&self) -> usize;

    /// Returns the heap memory usage, in bytes, used by this automaton.
    fn memory_usage(&self) -> usize;

    /// Returns a prefilter, if available, that can be used to accelerate
    /// searches for this automaton.
    ///
    /// The typical way this is used is when the start state is entered during
    /// a search. When that happens, one can use a prefilter to skip ahead and
    /// look for candidate matches without having to walk the automaton on the
    /// bytes between candidates.
    ///
    /// Typically a prefilter is only available when there are a small (<100)
    /// number of patterns built into the automaton.
    fn prefilter(&self) -> Option<&Prefilter>;

    /// Executes a non-overlapping search with this automaton using the given
    /// configuration.
    ///
    /// See
    /// [`AhoCorasick::try_find`](crate::AhoCorasick::try_find)
    /// for more documentation and examples.
    fn try_find(
        &self,
        input: &Input<'_>,
    ) -> Result<Option<Match>, MatchError> {
        try_find_fwd(&self, input)
    }

    /// Executes an overlapping search with this automaton using the given
    /// configuration.
    ///
    /// See
    /// [`AhoCorasick::try_find_overlapping`](crate::AhoCorasick::try_find_overlapping)
    /// for more documentation and examples.
    fn try_find_overlapping(
        &self,
        input: &Input<'_>,
        state: &mut OverlappingState,
    ) -> Result<(), MatchError> {
        try_find_overlapping_fwd(&self, input, state)
    }

    /// Returns an iterator of non-overlapping matches with this automaton
    /// using the given configuration.
    ///
    /// See
    /// [`AhoCorasick::try_find_iter`](crate::AhoCorasick::try_find_iter)
    /// for more documentation and examples.
    fn try_find_iter<'a, 'h>(
        &'a self,
        input: Input<'h>,
    ) -> Result<FindIter<'a, 'h, Self>, MatchError>
    where
        Self: Sized,
    {
        FindIter::new(self, input)
    }

    /// Returns an iterator of overlapping matches with this automaton
    /// using the given configuration.
    ///
    /// See
    /// [`AhoCorasick::try_find_overlapping_iter`](crate::AhoCorasick::try_find_overlapping_iter)
    /// for more documentation and examples.
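    ///
    /// As a brief sketch of driving this directly on one of this crate's
    /// automatons (the patterns are illustrative):
    ///
    /// ```
    /// use aho_corasick::{automaton::Automaton, nfa::noncontiguous::NFA, Input};
    ///
    /// let nfa = NFA::new(&["app", "append"]).unwrap();
    /// let matches: Vec<_> =
    ///     nfa.try_find_overlapping_iter(Input::new("append"))?.collect();
    /// assert_eq!(2, matches.len());
    /// # Ok::<(), aho_corasick::MatchError>(())
    /// ```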
fn try_find_overlapping_iter<'a, 'h>( &'a self, input: Input<'h>, ) -> Result, MatchError> where Self: Sized, { if !self.match_kind().is_standard() { return Err(MatchError::unsupported_overlapping( self.match_kind(), )); } // We might consider lifting this restriction. The reason why I added // it was to ban the combination of "anchored search" and "overlapping // iteration." The match semantics aren't totally clear in that case. // Should we allow *any* matches that are adjacent to *any* previous // match? Or only following the most recent one? Or only matches // that start at the beginning of the search? We might also elect to // just keep this restriction in place, as callers should be able to // implement it themselves if they want to. if input.get_anchored().is_anchored() { return Err(MatchError::invalid_input_anchored()); } let _ = self.start_state(input.get_anchored())?; let state = OverlappingState::start(); Ok(FindOverlappingIter { aut: self, input, state }) } /// Replaces all non-overlapping matches in `haystack` with /// strings from `replace_with` depending on the pattern that /// matched. The `replace_with` slice must have length equal to /// `Automaton::patterns_len`. /// /// See /// [`AhoCorasick::try_replace_all`](crate::AhoCorasick::try_replace_all) /// for more documentation and examples. fn try_replace_all( &self, haystack: &str, replace_with: &[B], ) -> Result where Self: Sized, B: AsRef, { assert_eq!( replace_with.len(), self.patterns_len(), "replace_all requires a replacement for every pattern \ in the automaton" ); let mut dst = String::with_capacity(haystack.len()); self.try_replace_all_with(haystack, &mut dst, |mat, _, dst| { dst.push_str(replace_with[mat.pattern()].as_ref()); true })?; Ok(dst) } /// Replaces all non-overlapping matches in `haystack` with /// strings from `replace_with` depending on the pattern that /// matched. The `replace_with` slice must have length equal to /// `Automaton::patterns_len`. /// /// See /// [`AhoCorasick::try_replace_all_bytes`](crate::AhoCorasick::try_replace_all_bytes) /// for more documentation and examples. fn try_replace_all_bytes( &self, haystack: &[u8], replace_with: &[B], ) -> Result, MatchError> where Self: Sized, B: AsRef<[u8]>, { assert_eq!( replace_with.len(), self.patterns_len(), "replace_all requires a replacement for every pattern \ in the automaton" ); let mut dst = Vec::with_capacity(haystack.len()); self.try_replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| { dst.extend(replace_with[mat.pattern()].as_ref()); true })?; Ok(dst) } /// Replaces all non-overlapping matches in `haystack` by calling the /// `replace_with` closure given. /// /// See /// [`AhoCorasick::try_replace_all_with`](crate::AhoCorasick::try_replace_all_with) /// for more documentation and examples. fn try_replace_all_with( &self, haystack: &str, dst: &mut String, mut replace_with: F, ) -> Result<(), MatchError> where Self: Sized, F: FnMut(&Match, &str, &mut String) -> bool, { let mut last_match = 0; for m in self.try_find_iter(Input::new(haystack))? { // Since there are no restrictions on what kinds of patterns are // in an Aho-Corasick automaton, we might get matches that split // a codepoint, or even matches of a partial codepoint. When that // happens, we just skip the match. 
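            // (A hedged illustration: the haystack "λ" is the UTF-8 byte
            // sequence [0xCE, 0xBB], so a pattern matching just the byte
            // 0xCE yields the span [0, 1). Offset 1 is not a char boundary,
            // so the check below skips that match entirely.)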
if !haystack.is_char_boundary(m.start()) || !haystack.is_char_boundary(m.end()) { continue; } dst.push_str(&haystack[last_match..m.start()]); last_match = m.end(); if !replace_with(&m, &haystack[m.start()..m.end()], dst) { break; }; } dst.push_str(&haystack[last_match..]); Ok(()) } /// Replaces all non-overlapping matches in `haystack` by calling the /// `replace_with` closure given. /// /// See /// [`AhoCorasick::try_replace_all_with_bytes`](crate::AhoCorasick::try_replace_all_with_bytes) /// for more documentation and examples. fn try_replace_all_with_bytes( &self, haystack: &[u8], dst: &mut Vec, mut replace_with: F, ) -> Result<(), MatchError> where Self: Sized, F: FnMut(&Match, &[u8], &mut Vec) -> bool, { let mut last_match = 0; for m in self.try_find_iter(Input::new(haystack))? { dst.extend(&haystack[last_match..m.start()]); last_match = m.end(); if !replace_with(&m, &haystack[m.start()..m.end()], dst) { break; }; } dst.extend(&haystack[last_match..]); Ok(()) } /// Returns an iterator of non-overlapping matches with this automaton /// from the stream given. /// /// See /// [`AhoCorasick::try_stream_find_iter`](crate::AhoCorasick::try_stream_find_iter) /// for more documentation and examples. #[cfg(feature = "std")] fn try_stream_find_iter<'a, R: std::io::Read>( &'a self, rdr: R, ) -> Result, MatchError> where Self: Sized, { Ok(StreamFindIter { it: StreamChunkIter::new(self, rdr)? }) } /// Replaces all non-overlapping matches in `rdr` with strings from /// `replace_with` depending on the pattern that matched, and writes the /// result to `wtr`. The `replace_with` slice must have length equal to /// `Automaton::patterns_len`. /// /// See /// [`AhoCorasick::try_stream_replace_all`](crate::AhoCorasick::try_stream_replace_all) /// for more documentation and examples. #[cfg(feature = "std")] fn try_stream_replace_all( &self, rdr: R, wtr: W, replace_with: &[B], ) -> std::io::Result<()> where Self: Sized, R: std::io::Read, W: std::io::Write, B: AsRef<[u8]>, { assert_eq!( replace_with.len(), self.patterns_len(), "streaming replace_all requires a replacement for every pattern \ in the automaton", ); self.try_stream_replace_all_with(rdr, wtr, |mat, _, wtr| { wtr.write_all(replace_with[mat.pattern()].as_ref()) }) } /// Replaces all non-overlapping matches in `rdr` by calling the /// `replace_with` closure given and writing the result to `wtr`. /// /// See /// [`AhoCorasick::try_stream_replace_all_with`](crate::AhoCorasick::try_stream_replace_all_with) /// for more documentation and examples. #[cfg(feature = "std")] fn try_stream_replace_all_with( &self, rdr: R, mut wtr: W, mut replace_with: F, ) -> std::io::Result<()> where Self: Sized, R: std::io::Read, W: std::io::Write, F: FnMut(&Match, &[u8], &mut W) -> std::io::Result<()>, { let mut it = StreamChunkIter::new(self, rdr).map_err(|e| { let kind = std::io::ErrorKind::Other; std::io::Error::new(kind, e) })?; while let Some(result) = it.next() { let chunk = result?; match chunk { StreamChunk::NonMatch { bytes, .. } => { wtr.write_all(bytes)?; } StreamChunk::Match { bytes, mat } => { replace_with(&mat, bytes, &mut wtr)?; } } } Ok(()) } } // SAFETY: This just defers to the underlying 'AcAutomaton' and thus inherits // its safety properties. 
unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { #[inline(always)] fn start_state(&self, anchored: Anchored) -> Result { (**self).start_state(anchored) } #[inline(always)] fn next_state( &self, anchored: Anchored, sid: StateID, byte: u8, ) -> StateID { (**self).next_state(anchored, sid, byte) } #[inline(always)] fn is_special(&self, sid: StateID) -> bool { (**self).is_special(sid) } #[inline(always)] fn is_dead(&self, sid: StateID) -> bool { (**self).is_dead(sid) } #[inline(always)] fn is_match(&self, sid: StateID) -> bool { (**self).is_match(sid) } #[inline(always)] fn is_start(&self, sid: StateID) -> bool { (**self).is_start(sid) } #[inline(always)] fn match_kind(&self) -> MatchKind { (**self).match_kind() } #[inline(always)] fn match_len(&self, sid: StateID) -> usize { (**self).match_len(sid) } #[inline(always)] fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { (**self).match_pattern(sid, index) } #[inline(always)] fn patterns_len(&self) -> usize { (**self).patterns_len() } #[inline(always)] fn pattern_len(&self, pid: PatternID) -> usize { (**self).pattern_len(pid) } #[inline(always)] fn min_pattern_len(&self) -> usize { (**self).min_pattern_len() } #[inline(always)] fn max_pattern_len(&self) -> usize { (**self).max_pattern_len() } #[inline(always)] fn memory_usage(&self) -> usize { (**self).memory_usage() } #[inline(always)] fn prefilter(&self) -> Option<&Prefilter> { (**self).prefilter() } } /// Represents the current state of an overlapping search. /// /// This is used for overlapping searches since they need to know something /// about the previous search. For example, when multiple patterns match at the /// same position, this state tracks the last reported pattern so that the next /// search knows whether to report another matching pattern or continue with /// the search at the next position. Additionally, it also tracks which state /// the last search call terminated in and the current offset of the search /// in the haystack. /// /// This type provides limited introspection capabilities. The only thing a /// caller can do is construct it and pass it around to permit search routines /// to use it to track state, and to ask whether a match has been found. /// /// Callers should always provide a fresh state constructed via /// [`OverlappingState::start`] when starting a new search. That same state /// should be reused for subsequent searches on the same `Input`. The state /// given will advance through the haystack itself. Callers can detect the end /// of a search when neither an error nor a match is returned. /// /// # Example /// /// This example shows how to manually iterate over all overlapping matches. If /// you need this, you might consider using /// [`AhoCorasick::find_overlapping_iter`](crate::AhoCorasick::find_overlapping_iter) /// instead, but this shows how to correctly use an `OverlappingState`. 
///
/// ```
/// use aho_corasick::{
///     automaton::OverlappingState,
///     AhoCorasick, Input, Match,
/// };
///
/// let patterns = &["append", "appendage", "app"];
/// let haystack = "append the app to the appendage";
///
/// let ac = AhoCorasick::new(patterns).unwrap();
/// let mut state = OverlappingState::start();
/// let mut matches = vec![];
///
/// loop {
///     ac.find_overlapping(haystack, &mut state);
///     let mat = match state.get_match() {
///         None => break,
///         Some(mat) => mat,
///     };
///     matches.push(mat);
/// }
/// let expected = vec![
///     Match::must(2, 0..3),
///     Match::must(0, 0..6),
///     Match::must(2, 11..14),
///     Match::must(2, 22..25),
///     Match::must(0, 22..28),
///     Match::must(1, 22..31),
/// ];
/// assert_eq!(expected, matches);
/// ```
#[derive(Clone, Debug)]
pub struct OverlappingState {
    /// The match reported by the most recent overlapping search to use this
    /// state.
    ///
    /// If a search does not find any matches, then it is expected to clear
    /// this value.
    mat: Option<Match>,
    /// The state ID of the state at which the search was in when the call
    /// terminated. When this is a match state, `mat` must be set to a
    /// non-None value.
    ///
    /// A `None` value indicates the start state of the corresponding
    /// automaton. We cannot use the actual ID, since any one automaton may
    /// have many start states, and which one is in use depends on search-time
    /// factors (such as whether the search is anchored or not).
    id: Option<StateID>,
    /// The position of the search.
    ///
    /// When `id` is None (i.e., we are starting a search), this is set to
    /// the beginning of the search as given by the caller regardless of its
    /// current value. Subsequent calls to an overlapping search pick up at
    /// this offset.
    at: usize,
    /// The index into the matching patterns of the next match to report if the
    /// current state is a match state. Note that this may be 1 greater than
    /// the total number of matches to report for the current match state. (In
    /// which case, no more matches should be reported at the current position
    /// and the search should advance to the next position.)
    next_match_index: Option<usize>,
}

impl OverlappingState {
    /// Create a new overlapping state that begins at the start state.
    pub fn start() -> OverlappingState {
        OverlappingState { mat: None, id: None, at: 0, next_match_index: None }
    }

    /// Return the match result of the most recent search to execute with this
    /// state.
    ///
    /// Every search will clear this result automatically, such that if no
    /// match is found, this will always correctly report `None`.
    pub fn get_match(&self) -> Option<Match> {
        self.mat
    }
}

/// An iterator of non-overlapping matches in a particular haystack.
///
/// This iterator yields matches according to the [`MatchKind`] used by this
/// automaton.
///
/// This iterator is constructed via the [`Automaton::try_find_iter`] method.
///
/// The type variable `A` refers to the implementation of the [`Automaton`]
/// trait used to execute the search.
///
/// The lifetime `'a` refers to the lifetime of the [`Automaton`]
/// implementation.
///
/// The lifetime `'h` refers to the lifetime of the haystack being searched.
#[derive(Debug)]
pub struct FindIter<'a, 'h, A> {
    /// The automaton used to drive the search.
    aut: &'a A,
    /// The input parameters to give to each search call.
    ///
    /// The start position of the search is mutated during iteration.
    input: Input<'h>,
    /// Records the end offset of the most recent match.
This is necessary to /// handle a corner case for preventing empty matches from overlapping with /// the ending bounds of a prior match. last_match_end: Option, } impl<'a, 'h, A: Automaton> FindIter<'a, 'h, A> { /// Creates a new non-overlapping iterator. If the given automaton would /// return an error on a search with the given input configuration, then /// that error is returned here. fn new( aut: &'a A, input: Input<'h>, ) -> Result, MatchError> { // The only way this search can fail is if we cannot retrieve the start // state. e.g., Asking for an anchored search when only unanchored // searches are supported. let _ = aut.start_state(input.get_anchored())?; Ok(FindIter { aut, input, last_match_end: None }) } /// Executes a search and returns a match if one is found. /// /// This does not advance the input forward. It just executes a search /// based on the current configuration/offsets. fn search(&self) -> Option { // The unwrap is OK here because we check at iterator construction time // that no subsequent search call (using the same configuration) will // ever return an error. self.aut .try_find(&self.input) .expect("already checked that no match error can occur") } /// Handles the special case of an empty match by ensuring that 1) the /// iterator always advances and 2) empty matches never overlap with other /// matches. /// /// (1) is necessary because we principally make progress by setting the /// starting location of the next search to the ending location of the last /// match. But if a match is empty, then this results in a search that does /// not advance and thus does not terminate. /// /// (2) is not strictly necessary, but makes intuitive sense and matches /// the presiding behavior of most general purpose regex engines. /// (Obviously this crate isn't a regex engine, but we choose to match /// their semantics.) The "intuitive sense" here is that we want to report /// NON-overlapping matches. So for example, given the patterns 'a' and /// '' (an empty string) against the haystack 'a', without the special /// handling, you'd get the matches [0, 1) and [1, 1), where the latter /// overlaps with the end bounds of the former. /// /// Note that we mark this cold and forcefully prevent inlining because /// handling empty matches like this is extremely rare and does require /// quite a bit of code, comparatively. Keeping this code out of the main /// iterator function keeps it smaller and more amenable to inlining /// itself. #[cold] #[inline(never)] fn handle_overlapping_empty_match( &mut self, mut m: Match, ) -> Option { assert!(m.is_empty()); if Some(m.end()) == self.last_match_end { self.input.set_start(self.input.start().checked_add(1).unwrap()); m = self.search()?; } Some(m) } } impl<'a, 'h, A: Automaton> Iterator for FindIter<'a, 'h, A> { type Item = Match; #[inline(always)] fn next(&mut self) -> Option { let mut m = self.search()?; if m.is_empty() { m = self.handle_overlapping_empty_match(m)?; } self.input.set_start(m.end()); self.last_match_end = Some(m.end()); Some(m) } } /// An iterator of overlapping matches in a particular haystack. /// /// This iterator will report all possible matches in a particular haystack, /// even when the matches overlap. /// /// This iterator is constructed via the /// [`Automaton::try_find_overlapping_iter`] method. /// /// The type variable `A` refers to the implementation of the [`Automaton`] /// trait used to execute the search. /// /// The lifetime `'a` refers to the lifetime of the [`Automaton`] /// implementation. 
///
/// The lifetime `'h` refers to the lifetime of the haystack being searched.
#[derive(Debug)]
pub struct FindOverlappingIter<'a, 'h, A> {
    aut: &'a A,
    input: Input<'h>,
    state: OverlappingState,
}

impl<'a, 'h, A: Automaton> Iterator for FindOverlappingIter<'a, 'h, A> {
    type Item = Match;

    #[inline(always)]
    fn next(&mut self) -> Option<Match> {
        self.aut
            .try_find_overlapping(&self.input, &mut self.state)
            .expect("already checked that no match error can occur here");
        self.state.get_match()
    }
}

/// An iterator that reports matches in a stream.
///
/// This iterator yields elements of type `io::Result<Match>`, where an error
/// is reported if there was a problem reading from the underlying stream.
/// The iterator terminates only when the underlying stream reaches `EOF`.
///
/// This iterator is constructed via the [`Automaton::try_stream_find_iter`]
/// method.
///
/// The type variable `A` refers to the implementation of the [`Automaton`]
/// trait used to execute the search.
///
/// The type variable `R` refers to the `io::Read` stream that is being read
/// from.
///
/// The lifetime `'a` refers to the lifetime of the [`Automaton`]
/// implementation.
#[cfg(feature = "std")]
#[derive(Debug)]
pub struct StreamFindIter<'a, A, R> {
    it: StreamChunkIter<'a, A, R>,
}

#[cfg(feature = "std")]
impl<'a, A: Automaton, R: std::io::Read> Iterator
    for StreamFindIter<'a, A, R>
{
    type Item = std::io::Result<Match>;

    fn next(&mut self) -> Option<std::io::Result<Match>> {
        loop {
            match self.it.next() {
                None => return None,
                Some(Err(err)) => return Some(Err(err)),
                Some(Ok(StreamChunk::NonMatch { .. })) => {}
                Some(Ok(StreamChunk::Match { mat, .. })) => {
                    return Some(Ok(mat));
                }
            }
        }
    }
}

/// An iterator that reports matches in a stream.
///
/// (This doesn't actually implement the `Iterator` trait because it returns
/// something with a lifetime attached to a buffer it owns, but that's OK. It
/// still has a `next` method and is iterator-like enough to be fine.)
///
/// This iterator yields elements of type `io::Result<StreamChunk>`, where
/// an error is reported if there was a problem reading from the underlying
/// stream. The iterator terminates only when the underlying stream reaches
/// `EOF`.
///
/// The idea here is that each chunk represents either a match or a non-match,
/// and if you concatenated all of the chunks together, you'd reproduce the
/// entire contents of the stream, byte-for-byte.
///
/// This chunk machinery is a bit complicated and it isn't strictly required
/// for a stream searcher that just reports matches. But we do need something
/// like this to deal with the "replacement" API, which needs to know which
/// chunks it can copy and which it needs to replace.
#[cfg(feature = "std")]
#[derive(Debug)]
struct StreamChunkIter<'a, A, R> {
    /// The underlying automaton to do the search.
    aut: &'a A,
    /// The source of bytes we read from.
    rdr: R,
    /// A roll buffer for managing bytes from `rdr`. Basically, this is used
    /// to handle the case of a match that is split by two different
    /// calls to `rdr.read()`. This isn't strictly needed if all we needed to
    /// do was report matches, but here we are reporting chunks of non-matches
    /// and matches and in order to do that, we really just cannot treat our
    /// stream as non-overlapping blocks of bytes. We need to permit some
    /// overlap while we retain bytes from a previous `read` call in memory.
    buf: crate::util::buffer::Buffer,
    /// The unanchored starting state of this automaton.
    start: StateID,
    /// The state of the automaton.
    sid: StateID,
    /// The absolute position over the entire stream.
absolute_pos: usize, /// The position we're currently at within `buf`. buffer_pos: usize, /// The buffer position of the end of the bytes that we last returned /// to the caller. Basically, whenever we find a match, we look to see if /// there is a difference between where the match started and the position /// of the last byte we returned to the caller. If there's a difference, /// then we need to return a 'NonMatch' chunk. buffer_reported_pos: usize, } #[cfg(feature = "std")] impl<'a, A: Automaton, R: std::io::Read> StreamChunkIter<'a, A, R> { fn new( aut: &'a A, rdr: R, ) -> Result, MatchError> { // This restriction is a carry-over from older versions of this crate. // I didn't have the bandwidth to think through how to handle, say, // leftmost-first or leftmost-longest matching, but... it should be // possible? The main problem is that once you see a match state in // leftmost-first semantics, you can't just stop at that point and // report a match. You have to keep going until you either hit a dead // state or EOF. So how do you know when you'll hit a dead state? Well, // you don't. With Aho-Corasick, I believe you can put a bound on it // and say, "once a match has been seen, you'll need to scan forward at // most N bytes" where N=aut.max_pattern_len(). // // Which is fine, but it does mean that state about whether we're still // looking for a dead state or not needs to persist across buffer // refills. Which this code doesn't really handle. It does preserve // *some* state across buffer refills, basically ensuring that a match // span is always in memory. if !aut.match_kind().is_standard() { return Err(MatchError::unsupported_stream(aut.match_kind())); } // This is kind of a cop-out, but empty matches are SUPER annoying. // If we know they can't happen (which is what we enforce here), then // it makes a lot of logic much simpler. With that said, I'm open to // supporting this case, but we need to define proper semantics for it // first. It wasn't totally clear to me what it should do at the time // of writing, so I decided to just be conservative. // // It also seems like a very weird case to support anyway. Why search a // stream if you're just going to get a match at every position? // // ¯\_(ツ)_/¯ if aut.min_pattern_len() == 0 { return Err(MatchError::unsupported_empty()); } let start = aut.start_state(Anchored::No)?; Ok(StreamChunkIter { aut, rdr, buf: crate::util::buffer::Buffer::new(aut.max_pattern_len()), start, sid: start, absolute_pos: 0, buffer_pos: 0, buffer_reported_pos: 0, }) } fn next(&mut self) -> Option> { // This code is pretty gnarly. It IS simpler than the equivalent code // in the previous aho-corasick release, in part because we inline // automaton traversal here and also in part because we have abdicated // support for automatons that contain an empty pattern. // // I suspect this code could be made a bit simpler by designing a // better buffer abstraction. // // But in general, this code is basically write-only. So you'll need // to go through it step-by-step to grok it. One of the key bits of // complexity is tracking a few different offsets. 'buffer_pos' is // where we are in the buffer for search. 'buffer_reported_pos' is the // position immediately following the last byte in the buffer that // we've returned to the caller. And 'absolute_pos' is the overall // current absolute position of the search in the entire stream, and // this is what match spans are reported in terms of. 
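        // A hedged, concrete illustration of that bookkeeping (the bytes are
        // invented for the example): if the buffer holds b"xfooy" and the
        // automaton's only pattern is "foo", then after the scan below finds
        // the match, 'buffer_pos' is 4 while 'buffer_reported_pos' is still
        // 0. One call to next() yields NonMatch("x") and advances
        // 'buffer_reported_pos' to 1, the following call yields Match("foo")
        // and advances it to 4, and the trailing "y" is reported as a final
        // NonMatch once EOF is reached.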
loop { if self.aut.is_match(self.sid) { let mat = self.get_match(); if let Some(r) = self.get_non_match_chunk(mat) { self.buffer_reported_pos += r.len(); let bytes = &self.buf.buffer()[r]; return Some(Ok(StreamChunk::NonMatch { bytes })); } self.sid = self.start; let r = self.get_match_chunk(mat); self.buffer_reported_pos += r.len(); let bytes = &self.buf.buffer()[r]; return Some(Ok(StreamChunk::Match { bytes, mat })); } if self.buffer_pos >= self.buf.buffer().len() { if let Some(r) = self.get_pre_roll_non_match_chunk() { self.buffer_reported_pos += r.len(); let bytes = &self.buf.buffer()[r]; return Some(Ok(StreamChunk::NonMatch { bytes })); } if self.buf.buffer().len() >= self.buf.min_buffer_len() { self.buffer_pos = self.buf.min_buffer_len(); self.buffer_reported_pos -= self.buf.buffer().len() - self.buf.min_buffer_len(); self.buf.roll(); } match self.buf.fill(&mut self.rdr) { Err(err) => return Some(Err(err)), Ok(true) => {} Ok(false) => { // We've hit EOF, but if there are still some // unreported bytes remaining, return them now. if let Some(r) = self.get_eof_non_match_chunk() { self.buffer_reported_pos += r.len(); let bytes = &self.buf.buffer()[r]; return Some(Ok(StreamChunk::NonMatch { bytes })); } // We've reported everything! return None; } } } let start = self.absolute_pos; for &byte in self.buf.buffer()[self.buffer_pos..].iter() { self.sid = self.aut.next_state(Anchored::No, self.sid, byte); self.absolute_pos += 1; if self.aut.is_match(self.sid) { break; } } self.buffer_pos += self.absolute_pos - start; } } /// Return a match chunk for the given match. It is assumed that the match /// ends at the current `buffer_pos`. fn get_match_chunk(&self, mat: Match) -> core::ops::Range<usize> { let start = self.buffer_pos - mat.len(); let end = self.buffer_pos; start..end } /// Return a non-match chunk, if necessary, just before reporting a match. /// This returns `None` if there is nothing to report. Otherwise, this /// assumes that the given match ends at the current `buffer_pos`. fn get_non_match_chunk( &self, mat: Match, ) -> Option<core::ops::Range<usize>> { let buffer_mat_start = self.buffer_pos - mat.len(); if buffer_mat_start > self.buffer_reported_pos { let start = self.buffer_reported_pos; let end = buffer_mat_start; return Some(start..end); } None } /// Look for any bytes that should be reported as a non-match just before /// rolling the buffer. /// /// Note that this only reports bytes up to `buffer.len() - /// min_buffer_len`, as it's not possible to know whether the bytes /// following that will participate in a match or not. fn get_pre_roll_non_match_chunk(&self) -> Option<core::ops::Range<usize>> { let end = self.buf.buffer().len().saturating_sub(self.buf.min_buffer_len()); if self.buffer_reported_pos < end { return Some(self.buffer_reported_pos..end); } None } /// Return any unreported bytes as a non-match up to the end of the buffer. /// /// This should only be called when the entire contents of the buffer have /// been searched and EOF has been hit when trying to fill the buffer. fn get_eof_non_match_chunk(&self) -> Option<core::ops::Range<usize>> { if self.buffer_reported_pos < self.buf.buffer().len() { return Some(self.buffer_reported_pos..self.buf.buffer().len()); } None } /// Return the match at the current position for the current state. /// /// This panics if `self.aut.is_match(self.sid)` isn't true. fn get_match(&self) -> Match { get_match(self.aut, self.sid, 0, self.absolute_pos) } } /// A single chunk yielded by the stream chunk iterator. /// /// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
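// A small illustration of the chunk invariant documented above (the
// pattern and haystack are hypothetical): searching the stream "xxabcyy"
// for the single pattern "abc" would yield, across successive calls to
// 'next':
//
//   StreamChunk::NonMatch { bytes: b"xx" }
//   StreamChunk::Match { bytes: b"abc", mat } // mat spans 2..5
//   StreamChunk::NonMatch { bytes: b"yy" }
//
// Concatenating the 'bytes' of every chunk reproduces "xxabcyy"
// byte-for-byte, which is exactly what the stream replacement API relies
// on when deciding which bytes to copy and which to replace.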
#[cfg(feature = "std")] #[derive(Debug)] enum StreamChunk<'r> { /// A chunk that does not contain any matches. NonMatch { bytes: &'r [u8] }, /// A chunk that precisely contains a match. Match { bytes: &'r [u8], mat: Match }, } #[inline(never)] pub(crate) fn try_find_fwd( aut: &A, input: &Input<'_>, ) -> Result, MatchError> { if input.is_done() { return Ok(None); } let earliest = aut.match_kind().is_standard() || input.get_earliest(); if input.get_anchored().is_anchored() { try_find_fwd_imp(aut, input, None, Anchored::Yes, earliest) } else if let Some(pre) = aut.prefilter() { if earliest { try_find_fwd_imp(aut, input, Some(pre), Anchored::No, true) } else { try_find_fwd_imp(aut, input, Some(pre), Anchored::No, false) } } else { if earliest { try_find_fwd_imp(aut, input, None, Anchored::No, true) } else { try_find_fwd_imp(aut, input, None, Anchored::No, false) } } } #[inline(always)] fn try_find_fwd_imp( aut: &A, input: &Input<'_>, pre: Option<&Prefilter>, anchored: Anchored, earliest: bool, ) -> Result, MatchError> { let mut sid = aut.start_state(input.get_anchored())?; let mut at = input.start(); let mut mat = None; if aut.is_match(sid) { mat = Some(get_match(aut, sid, 0, at)); if earliest { return Ok(mat); } } if let Some(pre) = pre { match pre.find_in(input.haystack(), input.get_span()) { Candidate::None => return Ok(None), Candidate::Match(m) => return Ok(Some(m)), Candidate::PossibleStartOfMatch(i) => { at = i; } } } while at < input.end() { // I've tried unrolling this loop and eliding bounds checks, but no // matter what I did, I could not observe a consistent improvement on // any benchmark I could devise. (If someone wants to re-litigate this, // the way to do it is to add an 'next_state_unchecked' method to the // 'Automaton' trait with a default impl that uses 'next_state'. Then // use 'aut.next_state_unchecked' here and implement it on DFA using // unchecked slice index acces.) sid = aut.next_state(anchored, sid, input.haystack()[at]); if aut.is_special(sid) { if aut.is_dead(sid) { return Ok(mat); } else if aut.is_match(sid) { // We use 'at + 1' here because the match state is entered // at the last byte of the pattern. Since we use half-open // intervals, the end of the range of the match is one past the // last byte. let m = get_match(aut, sid, 0, at + 1); // For the automata in this crate, we make a size trade off // where we reuse the same automaton for both anchored and // unanchored searches. We achieve this, principally, by simply // not following failure transitions while computing the next // state. Instead, if we fail to find the next state, we return // a dead state, which instructs the search to stop. (This // is why 'next_state' needs to know whether the search is // anchored or not.) In addition, we have different start // states for anchored and unanchored searches. The latter has // a self-loop where as the former does not. // // In this way, we can use the same trie to execute both // anchored and unanchored searches. There is a catch though. // When building an Aho-Corasick automaton for unanchored // searches, we copy matches from match states to other states // (which would otherwise not be match states) if they are // reachable via a failure transition. In the case of an // anchored search, we *specifically* do not want to report // these matches because they represent matches that start past // the beginning of the search. 
// // Now we could tweak the automaton somehow to differentiate // anchored from unanchored match states, but this would make // 'aut.is_match' and potentially 'aut.is_special' slower. And // also make the automaton itself more complex. // // Instead, we insert a special hack: if the search is // anchored, we simply ignore matches that don't begin at // the start of the search. This is not quite ideal, but we // do specialize this function in such a way that unanchored // searches don't pay for this additional branch. While this // might cause a search to continue on for more than it // otherwise optimally would, it will be no more than the // longest pattern in the automaton. The reason for this is // that we ensure we don't follow failure transitions during // an anchored search. Combined with using a different anchored // starting state with no self-loop, we guarantee that we'll // at worst move through a number of transitions equal to the // longest pattern. // // Now for DFAs, the whole point of them is to eliminate // failure transitions entirely. So there is no way to say "if // it's an anchored search don't follow failure transitions." // Instead, we actually have to build two entirely separate // automatons into the transition table. One with failure // transitions built into it and another that is effectively // just an encoding of the base trie into a transition table. // DFAs still need this check though, because the match states // still carry matches only reachable via a failure transition. // Why? Because removing them seems difficult, although I // haven't given it a lot of thought. if !(anchored.is_anchored() && m.start() > input.start()) { mat = Some(m); if earliest { return Ok(mat); } } } else if let Some(pre) = pre { // If we're here, we know it's a special state that is not a // dead or a match state AND that a prefilter is active. Thus, // it must be a start state. debug_assert!(aut.is_start(sid)); // We don't care about 'Candidate::Match' here because if such // a match were possible, it would have been returned above // when we run the prefilter before walking the automaton. let span = Span::from(at..input.end()); match pre.find_in(input.haystack(), span).into_option() { None => return Ok(None), Some(i) => { if i > at { at = i; continue; } } } } else { // When pre.is_none(), then starting states should not be // treated as special. That is, without a prefilter, is_special // should only return true when the state is a dead or a match // state. // // It is possible to execute a search without a prefilter even // when the underlying searcher has one: an anchored search. // But in this case, the automaton makes it impossible to move // back to the start state by construction, and thus, we should // never reach this branch. debug_assert!(false, "unreachable"); } } at += 1; } Ok(mat) } #[inline(never)] fn try_find_overlapping_fwd( aut: &A, input: &Input<'_>, state: &mut OverlappingState, ) -> Result<(), MatchError> { state.mat = None; if input.is_done() { return Ok(()); } // Searching with a pattern ID is always anchored, so we should only ever // use a prefilter when no pattern ID is given. 
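// For reference, callers typically drive the overlapping search below in
// a loop that reuses one 'OverlappingState'. A minimal sketch (the
// automaton construction is elided; any 'Automaton' impl works, and
// 'handle' is a hypothetical stand-in for the caller's own logic):
//
//   let mut state = OverlappingState::start();
//   loop {
//       aut.try_find_overlapping(&input, &mut state)?;
//       match state.get_match() {
//           None => break,
//           Some(m) => handle(m),
//       }
//   }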
if aut.prefilter().is_some() && !input.get_anchored().is_anchored() { let pre = aut.prefilter().unwrap(); try_find_overlapping_fwd_imp(aut, input, Some(pre), state) } else { try_find_overlapping_fwd_imp(aut, input, None, state) } } #[inline(always)] fn try_find_overlapping_fwd_imp( aut: &A, input: &Input<'_>, pre: Option<&Prefilter>, state: &mut OverlappingState, ) -> Result<(), MatchError> { let mut sid = match state.id { None => { let sid = aut.start_state(input.get_anchored())?; // Handle the case where the start state is a match state. That is, // the empty string is in our automaton. We report every match we // can here before moving on and updating 'state.at' and 'state.id' // to find more matches in other parts of the haystack. if aut.is_match(sid) { let i = state.next_match_index.unwrap_or(0); let len = aut.match_len(sid); if i < len { state.next_match_index = Some(i + 1); state.mat = Some(get_match(aut, sid, i, input.start())); return Ok(()); } } state.at = input.start(); state.id = Some(sid); state.next_match_index = None; state.mat = None; sid } Some(sid) => { // If we still have matches left to report in this state then // report them until we've exhausted them. Only after that do we // advance to the next offset in the haystack. if let Some(i) = state.next_match_index { let len = aut.match_len(sid); if i < len { state.next_match_index = Some(i + 1); state.mat = Some(get_match(aut, sid, i, state.at + 1)); return Ok(()); } // Once we've reported all matches at a given position, we need // to advance the search to the next position. state.at += 1; state.next_match_index = None; state.mat = None; } sid } }; while state.at < input.end() { sid = aut.next_state( input.get_anchored(), sid, input.haystack()[state.at], ); if aut.is_special(sid) { state.id = Some(sid); if aut.is_dead(sid) { return Ok(()); } else if aut.is_match(sid) { state.next_match_index = Some(1); state.mat = Some(get_match(aut, sid, 0, state.at + 1)); return Ok(()); } else if let Some(pre) = pre { // If we're here, we know it's a special state that is not a // dead or a match state AND that a prefilter is active. Thus, // it must be a start state. debug_assert!(aut.is_start(sid)); let span = Span::from(state.at..input.end()); match pre.find_in(input.haystack(), span).into_option() { None => return Ok(()), Some(i) => { if i > state.at { state.at = i; continue; } } } } else { // When pre.is_none(), then starting states should not be // treated as special. That is, without a prefilter, is_special // should only return true when the state is a dead or a match // state. // // ... except for one special case: in stream searching, we // currently call overlapping search with a 'None' prefilter, // regardless of whether one exists or not, because stream // searching can't currently deal with prefilters correctly in // all cases. } } state.at += 1; } state.id = Some(sid); Ok(()) } #[inline(always)] fn get_match( aut: &A, sid: StateID, index: usize, at: usize, ) -> Match { let pid = aut.match_pattern(sid, index); let len = aut.pattern_len(pid); Match::new(pid, (at - len)..at) } /// Write a prefix "state" indicator for fmt::Debug impls. It always writes /// exactly two printable bytes to the given formatter. /// /// Specifically, this tries to succinctly distinguish the different types of /// states: dead states, start states and match states. It even accounts for /// the possible overlappings of different state types. (The only possible /// overlapping is that of match and start states.) 
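// Concretely, the two-byte prefixes written by this helper render as
// follows (each line of a Debug dump begins with one of these):
//
//   "D "  dead state
//   "* "  match state
//   "*>"  match state that is also a start state
//   " >"  start state
//   "  "  any other state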
pub(crate) fn fmt_state_indicator<A: Automaton>( f: &mut core::fmt::Formatter<'_>, aut: A, id: StateID, ) -> core::fmt::Result { if aut.is_dead(id) { write!(f, "D ")?; } else if aut.is_match(id) { if aut.is_start(id) { write!(f, "*>")?; } else { write!(f, "* ")?; } } else if aut.is_start(id) { write!(f, " >")?; } else { write!(f, "  ")?; } Ok(()) } /// Return an iterator of transitions in a sparse format given an iterator /// of all explicitly defined transitions. The iterator yields ranges of /// transitions, such that any adjacent transitions mapped to the same /// state are combined into a single range. pub(crate) fn sparse_transitions<'a>( mut it: impl Iterator<Item = (u8, StateID)> + 'a, ) -> impl Iterator<Item = (u8, u8, StateID)> + 'a { let mut cur: Option<(u8, u8, StateID)> = None; core::iter::from_fn(move || { while let Some((class, next)) = it.next() { let (prev_start, prev_end, prev_next) = match cur { Some(x) => x, None => { cur = Some((class, class, next)); continue; } }; if prev_next == next { cur = Some((prev_start, class, prev_next)); } else { cur = Some((class, class, next)); return Some((prev_start, prev_end, prev_next)); } } if let Some((start, end, next)) = cur.take() { return Some((start, end, next)); } None }) } aho-corasick-1.1.3/src/dfa.rs000064400000000000000000000775541046102023000141000ustar 00000000000000/*! Provides direct access to a DFA implementation of Aho-Corasick. This is a low-level API that generally only needs to be used in niche circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of a DFA directly. Using a `DFA` directly is typically only necessary when one needs access to the [`Automaton`] trait implementation. */ use alloc::{vec, vec::Vec}; use crate::{ automaton::Automaton, nfa::noncontiguous, util::{ alphabet::ByteClasses, error::{BuildError, MatchError}, int::{Usize, U32}, prefilter::Prefilter, primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, search::{Anchored, MatchKind, StartKind}, special::Special, }, }; /// A DFA implementation of Aho-Corasick. /// /// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of /// this type directly. Using a `DFA` directly is typically only necessary when /// one needs access to the [`Automaton`] trait implementation. /// /// This DFA can only be built by first constructing a [`noncontiguous::NFA`]. /// Both [`DFA::new`] and [`Builder::build`] do this for you automatically, but /// [`Builder::build_from_noncontiguous`] permits doing it explicitly. /// /// A DFA provides the best possible search performance (in this crate) via two /// mechanisms: /// /// * All states use a dense representation for their transitions. /// * All failure transitions are pre-computed such that they are never /// explicitly handled at search time. /// /// These two facts combined mean that every state transition is performed /// using a constant number of instructions. However, this comes at /// great cost. The memory usage of a DFA can be quite exorbitant. /// It is potentially multiple orders of magnitude greater than a /// [`contiguous::NFA`](crate::nfa::contiguous::NFA) for example. In exchange, /// a DFA will typically have better search speed than a `contiguous::NFA`, but /// not by orders of magnitude. /// /// Unless you have a small number of patterns or memory usage is not a concern /// and search performance is critical, a DFA is usually not the best choice. /// /// Moreover, unlike the NFAs in this crate, it is costly for a DFA to /// support both anchored and unanchored search configurations.
Namely, /// since failure transitions are pre-computed, supporting both anchored /// and unanchored searches requires a duplication of the transition table, /// making the memory usage of such a DFA even bigger. (The NFAs in this crate /// unconditionally support both anchored and unanchored searches because there /// is essentially no added cost for doing so.) It is for this reason that /// a DFA's support for anchored and unanchored searches can be configured /// via [`Builder::start_kind`]. By default, a DFA only supports unanchored /// searches. /// /// # Example /// /// This example shows how to build a `DFA` directly and use it to execute /// [`Automaton::try_find`]: /// /// ``` /// use aho_corasick::{ /// automaton::Automaton, /// dfa::DFA, /// Input, Match, /// }; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let nfa = DFA::new(patterns).unwrap(); /// assert_eq!( /// Some(Match::must(0, 1..2)), /// nfa.try_find(&Input::new(haystack))?, /// ); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// It is also possible to implement your own version of `try_find`. See the /// [`Automaton`] documentation for an example. #[derive(Clone)] pub struct DFA { /// The DFA transition table. IDs in this table are pre-multiplied. So /// instead of the IDs being 0, 1, 2, 3, ..., they are 0*stride, 1*stride, /// 2*stride, 3*stride, ... trans: Vec<StateID>, /// The matches for every match state in this DFA. This is first indexed by /// state index (so that's `sid >> stride2`) and then by order in which the /// matches are meant to occur. matches: Vec<Vec<PatternID>>, /// The amount of heap memory used, in bytes, by the inner Vecs of /// 'matches'. matches_memory_usage: usize, /// The length of each pattern. This is used to compute the start offset /// of a match. pattern_lens: Vec<SmallIndex>, /// A prefilter for accelerating searches, if one exists. prefilter: Option<Prefilter>, /// The match semantics built into this DFA. match_kind: MatchKind, /// The total number of states in this DFA. state_len: usize, /// The alphabet size, or total number of equivalence classes, for this /// DFA. Note that the actual number of transitions in each state is /// stride=2^stride2, where stride is the smallest power of 2 greater than /// or equal to alphabet_len. We do things this way so that we can use /// bitshifting to go from a state ID to an index into 'matches'. alphabet_len: usize, /// The exponent with a base 2, such that stride=2^stride2. Given a state /// index 'i', its state identifier is 'i << stride2'. Given a state /// identifier 'sid', its state index is 'sid >> stride2'. stride2: usize, /// The equivalence classes for this DFA. All transitions are defined on /// equivalence classes and not on the 256 distinct byte values. byte_classes: ByteClasses, /// The length of the shortest pattern in this automaton. min_pattern_len: usize, /// The length of the longest pattern in this automaton. max_pattern_len: usize, /// The information required to deduce which states are "special" in this /// DFA. special: Special, } impl DFA { /// Create a new Aho-Corasick DFA using the default configuration. /// /// Use a [`Builder`] if you want to change the configuration. pub fn new<I, P>(patterns: I) -> Result<DFA, BuildError> where I: IntoIterator<Item = P>, P: AsRef<[u8]>, { DFA::builder().build(patterns) } /// A convenience method for returning a new Aho-Corasick DFA builder. /// /// This usually permits one to just import the `DFA` type.
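// To make the pre-multiplied ID scheme described on the fields above
// concrete (the numbers are illustrative): with alphabet_len = 5, the
// stride rounds up to 8 and stride2 = 3. The state at index 2 then has ID
// 2 << 3 = 16, its transition for equivalence class 4 lives at
// trans[16 + 4], and its state index is recovered with sid >> stride2
// (the 'matches' lookup additionally subtracts 2 to skip the dead and
// fail states).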
pub fn builder() -> Builder { Builder::new() } } impl DFA { /// A sentinel state ID indicating that a search should stop once it has /// entered this state. When a search stops, it returns a match if one has /// been found, otherwise no match. A DFA always has an actual dead state /// at this ID. /// /// N.B. DFAs, unlike NFAs, do not have any notion of a FAIL state. /// Namely, the whole point of a DFA is that the FAIL state is completely /// compiled away. That is, DFA construction involves pre-computing the /// failure transitions everywhere, such that failure transitions are no /// longer used at search time. This, combined with its uniformly dense /// representation, are the two most important factors in why it's faster /// than the NFAs in this crate. const DEAD: StateID = StateID::new_unchecked(0); /// Adds the given pattern IDs as matches to the given state and also /// records the added memory usage. fn set_matches( &mut self, sid: StateID, pids: impl Iterator<Item = PatternID>, ) { let index = (sid.as_usize() >> self.stride2).checked_sub(2).unwrap(); let mut at_least_one = false; for pid in pids { self.matches[index].push(pid); self.matches_memory_usage += PatternID::SIZE; at_least_one = true; } assert!(at_least_one, "match state must have non-empty pids"); } } // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always // returns a valid state ID given a valid state ID. We otherwise claim that // all other methods are correct as well. unsafe impl Automaton for DFA { #[inline(always)] fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> { // Either of the start state IDs can be DEAD, in which case, support // for that type of search is not provided by this DFA. Which start // state IDs are inactive depends on the 'StartKind' configuration at // DFA construction time.
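// A hedged sketch of how this surfaces to callers (the builder settings
// and patterns are made up for illustration): a DFA built only for
// anchored searches leaves the unanchored start ID at DEAD, so
// requesting an unanchored start state fails:
//
//   let dfa = DFA::builder()
//       .start_kind(StartKind::Anchored)
//       .build(&["foo"])
//       .unwrap();
//   assert!(dfa.start_state(Anchored::No).is_err());
//   assert!(dfa.start_state(Anchored::Yes).is_ok());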
match anchored { Anchored::No => { let start = self.special.start_unanchored_id; if start == DFA::DEAD { Err(MatchError::invalid_input_unanchored()) } else { Ok(start) } } Anchored::Yes => { let start = self.special.start_anchored_id; if start == DFA::DEAD { Err(MatchError::invalid_input_anchored()) } else { Ok(start) } } } } #[inline(always)] fn next_state( &self, _anchored: Anchored, sid: StateID, byte: u8, ) -> StateID { let class = self.byte_classes.get(byte); self.trans[(sid.as_u32() + u32::from(class)).as_usize()] } #[inline(always)] fn is_special(&self, sid: StateID) -> bool { sid <= self.special.max_special_id } #[inline(always)] fn is_dead(&self, sid: StateID) -> bool { sid == DFA::DEAD } #[inline(always)] fn is_match(&self, sid: StateID) -> bool { !self.is_dead(sid) && sid <= self.special.max_match_id } #[inline(always)] fn is_start(&self, sid: StateID) -> bool { sid == self.special.start_unanchored_id || sid == self.special.start_anchored_id } #[inline(always)] fn match_kind(&self) -> MatchKind { self.match_kind } #[inline(always)] fn patterns_len(&self) -> usize { self.pattern_lens.len() } #[inline(always)] fn pattern_len(&self, pid: PatternID) -> usize { self.pattern_lens[pid].as_usize() } #[inline(always)] fn min_pattern_len(&self) -> usize { self.min_pattern_len } #[inline(always)] fn max_pattern_len(&self) -> usize { self.max_pattern_len } #[inline(always)] fn match_len(&self, sid: StateID) -> usize { debug_assert!(self.is_match(sid)); let offset = (sid.as_usize() >> self.stride2) - 2; self.matches[offset].len() } #[inline(always)] fn match_pattern(&self, sid: StateID, index: usize) -> PatternID { debug_assert!(self.is_match(sid)); let offset = (sid.as_usize() >> self.stride2) - 2; self.matches[offset][index] } #[inline(always)] fn memory_usage(&self) -> usize { use core::mem::size_of; (self.trans.len() * size_of::()) + (self.matches.len() * size_of::>()) + self.matches_memory_usage + (self.pattern_lens.len() * size_of::()) + self.prefilter.as_ref().map_or(0, |p| p.memory_usage()) } #[inline(always)] fn prefilter(&self) -> Option<&Prefilter> { self.prefilter.as_ref() } } impl core::fmt::Debug for DFA { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { use crate::{ automaton::{fmt_state_indicator, sparse_transitions}, util::debug::DebugByte, }; writeln!(f, "dfa::DFA(")?; for index in 0..self.state_len { let sid = StateID::new_unchecked(index << self.stride2); // While we do currently include the FAIL state in the transition // table (to simplify construction), it is never actually used. It // poses problems with the code below because it gets treated as // a match state incidentally when it is, of course, not. So we // special case it. The fail state is always the first state after // the dead state. // // If the construction is changed to remove the fail state (it // probably should be), then this special case should be updated. 
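// For example, with stride2 = 3 the FAIL state (index 1) is rendered by
// the special case below as a line like "F 000008:" (its pre-multiplied
// ID is 1 << 3 = 8), with no transitions listed. (Illustrative output;
// the exact ID shown depends on the stride.)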
if index == 1 { writeln!(f, "F {:06}:", sid.as_usize())?; continue; } fmt_state_indicator(f, self, sid)?; write!(f, "{:06}: ", sid.as_usize())?; let it = (0..self.byte_classes.alphabet_len()).map(|class| { (class.as_u8(), self.trans[sid.as_usize() + class]) }); for (i, (start, end, next)) in sparse_transitions(it).enumerate() { if i > 0 { write!(f, ", ")?; } if start == end { write!( f, "{:?} => {:?}", DebugByte(start), next.as_usize() )?; } else { write!( f, "{:?}-{:?} => {:?}", DebugByte(start), DebugByte(end), next.as_usize() )?; } } write!(f, "\n")?; if self.is_match(sid) { write!(f, " matches: ")?; for i in 0..self.match_len(sid) { if i > 0 { write!(f, ", ")?; } let pid = self.match_pattern(sid, i); write!(f, "{}", pid.as_usize())?; } write!(f, "\n")?; } } writeln!(f, "match kind: {:?}", self.match_kind)?; writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?; writeln!(f, "state length: {:?}", self.state_len)?; writeln!(f, "pattern length: {:?}", self.patterns_len())?; writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?; writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?; writeln!(f, "alphabet length: {:?}", self.alphabet_len)?; writeln!(f, "stride: {:?}", 1 << self.stride2)?; writeln!(f, "byte classes: {:?}", self.byte_classes)?; writeln!(f, "memory usage: {:?}", self.memory_usage())?; writeln!(f, ")")?; Ok(()) } } /// A builder for configuring an Aho-Corasick DFA. /// /// This builder has a subset of the options available to a /// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options, /// their behavior is identical. #[derive(Clone, Debug)] pub struct Builder { noncontiguous: noncontiguous::Builder, start_kind: StartKind, byte_classes: bool, } impl Default for Builder { fn default() -> Builder { Builder { noncontiguous: noncontiguous::Builder::new(), start_kind: StartKind::Unanchored, byte_classes: true, } } } impl Builder { /// Create a new builder for configuring an Aho-Corasick DFA. pub fn new() -> Builder { Builder::default() } /// Build an Aho-Corasick DFA from the given iterator of patterns. /// /// A builder may be reused to create more DFAs. pub fn build(&self, patterns: I) -> Result where I: IntoIterator, P: AsRef<[u8]>, { let nnfa = self.noncontiguous.build(patterns)?; self.build_from_noncontiguous(&nnfa) } /// Build an Aho-Corasick DFA from the given noncontiguous NFA. /// /// Note that when this method is used, only the `start_kind` and /// `byte_classes` settings on this builder are respected. The other /// settings only apply to the initial construction of the Aho-Corasick /// automaton. Since using this method requires that initial construction /// has already completed, all settings impacting only initial construction /// are no longer relevant. pub fn build_from_noncontiguous( &self, nnfa: &noncontiguous::NFA, ) -> Result { debug!("building DFA"); let byte_classes = if self.byte_classes { nnfa.byte_classes().clone() } else { ByteClasses::singletons() }; let state_len = match self.start_kind { StartKind::Unanchored | StartKind::Anchored => nnfa.states().len(), StartKind::Both => { // These unwraps are OK because we know that the number of // NFA states is < StateID::LIMIT which is in turn less than // i32::MAX. Thus, there is always room to multiply by 2. // Finally, the number of states is always at least 4 in the // NFA (DEAD, FAIL, START-UNANCHORED, START-ANCHORED), so the // subtraction of 4 is okay. 
// // Note that we subtract 4 because the "anchored" part of // the DFA duplicates the unanchored part (without failure // transitions), but reuses the DEAD, FAIL and START states. nnfa.states() .len() .checked_mul(2) .unwrap() .checked_sub(4) .unwrap() } }; let trans_len = match state_len.checked_shl(byte_classes.stride2().as_u32()) { Some(trans_len) => trans_len, None => { return Err(BuildError::state_id_overflow( StateID::MAX.as_u64(), usize::MAX.as_u64(), )) } }; StateID::new(trans_len.checked_sub(byte_classes.stride()).unwrap()) .map_err(|e| { BuildError::state_id_overflow( StateID::MAX.as_u64(), e.attempted(), ) })?; let num_match_states = match self.start_kind { StartKind::Unanchored | StartKind::Anchored => { nnfa.special().max_match_id.as_usize().checked_sub(1).unwrap() } StartKind::Both => nnfa .special() .max_match_id .as_usize() .checked_sub(1) .unwrap() .checked_mul(2) .unwrap(), }; let mut dfa = DFA { trans: vec![DFA::DEAD; trans_len], matches: vec![vec![]; num_match_states], matches_memory_usage: 0, pattern_lens: nnfa.pattern_lens_raw().to_vec(), prefilter: nnfa.prefilter().map(|p| p.clone()), match_kind: nnfa.match_kind(), state_len, alphabet_len: byte_classes.alphabet_len(), stride2: byte_classes.stride2(), byte_classes, min_pattern_len: nnfa.min_pattern_len(), max_pattern_len: nnfa.max_pattern_len(), // The special state IDs are set later. special: Special::zero(), }; match self.start_kind { StartKind::Both => { self.finish_build_both_starts(nnfa, &mut dfa); } StartKind::Unanchored => { self.finish_build_one_start(Anchored::No, nnfa, &mut dfa); } StartKind::Anchored => { self.finish_build_one_start(Anchored::Yes, nnfa, &mut dfa) } } debug!( "DFA built, <states: {}, size: {}, alphabet len: {}, stride: {}>", dfa.state_len, dfa.memory_usage(), dfa.byte_classes.alphabet_len(), dfa.byte_classes.stride(), ); // The vectors can grow ~twice as big during construction because a // Vec amortizes growth. But here, let's shrink things back down to // what we actually need since we're never going to add more to it. dfa.trans.shrink_to_fit(); dfa.pattern_lens.shrink_to_fit(); dfa.matches.shrink_to_fit(); // TODO: We might also want to shrink each Vec inside of `dfa.matches`, // or even better, convert it to one contiguous allocation. But I think // I went with nested allocs for good reason (can't remember), so this // may be tricky to do. I decided not to shrink them here because it // might require a fair bit of work to do. It's unclear whether it's // worth it. Ok(dfa) } /// Finishes building a DFA for either unanchored or anchored searches, /// but NOT both. fn finish_build_one_start( &self, anchored: Anchored, nnfa: &noncontiguous::NFA, dfa: &mut DFA, ) { // This function always succeeds because we check above that all of the // states in the NFA can be mapped to DFA state IDs. let stride2 = dfa.stride2; let old2new = |oldsid: StateID| { StateID::new_unchecked(oldsid.as_usize() << stride2) }; for (oldsid, state) in nnfa.states().iter().with_state_ids() { let newsid = old2new(oldsid); if state.is_match() { dfa.set_matches(newsid, nnfa.iter_matches(oldsid)); } sparse_iter( nnfa, oldsid, &dfa.byte_classes, |byte, class, mut oldnextsid| { if oldnextsid == noncontiguous::NFA::FAIL { if anchored.is_anchored() { oldnextsid = noncontiguous::NFA::DEAD; } else if state.fail() == noncontiguous::NFA::DEAD { // This is a special case that avoids following // DEAD transitions in a non-contiguous NFA.
// Following these transitions is pretty slow // because the non-contiguous NFA will always use // a sparse representation for it (because the // DEAD state is usually treated as a sentinel). // The *vast* majority of failure states are DEAD // states, so this winds up being pretty slow if // we go through the non-contiguous NFA state // transition logic. Instead, just do it ourselves. oldnextsid = noncontiguous::NFA::DEAD; } else { oldnextsid = nnfa.next_state( Anchored::No, state.fail(), byte, ); } } dfa.trans[newsid.as_usize() + usize::from(class)] = old2new(oldnextsid); }, ); } // Now that we've remapped all the IDs in our states, all that's left // is remapping the special state IDs. let old = nnfa.special(); let new = &mut dfa.special; new.max_special_id = old2new(old.max_special_id); new.max_match_id = old2new(old.max_match_id); if anchored.is_anchored() { new.start_unanchored_id = DFA::DEAD; new.start_anchored_id = old2new(old.start_anchored_id); } else { new.start_unanchored_id = old2new(old.start_unanchored_id); new.start_anchored_id = DFA::DEAD; } } /// Finishes building a DFA that supports BOTH unanchored and anchored /// searches. It works by inter-leaving unanchored states with anchored /// states in the same transition table. This way, we avoid needing to /// re-shuffle states afterward to ensure that our states still look like /// DEAD, MATCH, ..., START-UNANCHORED, START-ANCHORED, NON-MATCH, ... /// /// Honestly this is pretty inscrutable... Simplifications are most /// welcome. fn finish_build_both_starts( &self, nnfa: &noncontiguous::NFA, dfa: &mut DFA, ) { let stride2 = dfa.stride2; let stride = 1 << stride2; let mut remap_unanchored = vec![DFA::DEAD; nnfa.states().len()]; let mut remap_anchored = vec![DFA::DEAD; nnfa.states().len()]; let mut is_anchored = vec![false; dfa.state_len]; let mut newsid = DFA::DEAD; let next_dfa_id = |sid: StateID| StateID::new_unchecked(sid.as_usize() + stride); for (oldsid, state) in nnfa.states().iter().with_state_ids() { if oldsid == noncontiguous::NFA::DEAD || oldsid == noncontiguous::NFA::FAIL { remap_unanchored[oldsid] = newsid; remap_anchored[oldsid] = newsid; newsid = next_dfa_id(newsid); } else if oldsid == nnfa.special().start_unanchored_id || oldsid == nnfa.special().start_anchored_id { if oldsid == nnfa.special().start_unanchored_id { remap_unanchored[oldsid] = newsid; remap_anchored[oldsid] = DFA::DEAD; } else { remap_unanchored[oldsid] = DFA::DEAD; remap_anchored[oldsid] = newsid; is_anchored[newsid.as_usize() >> stride2] = true; } if state.is_match() { dfa.set_matches(newsid, nnfa.iter_matches(oldsid)); } sparse_iter( nnfa, oldsid, &dfa.byte_classes, |_, class, oldnextsid| { let class = usize::from(class); if oldnextsid == noncontiguous::NFA::FAIL { dfa.trans[newsid.as_usize() + class] = DFA::DEAD; } else { dfa.trans[newsid.as_usize() + class] = oldnextsid; } }, ); newsid = next_dfa_id(newsid); } else { let unewsid = newsid; newsid = next_dfa_id(newsid); let anewsid = newsid; newsid = next_dfa_id(newsid); remap_unanchored[oldsid] = unewsid; remap_anchored[oldsid] = anewsid; is_anchored[anewsid.as_usize() >> stride2] = true; if state.is_match() { dfa.set_matches(unewsid, nnfa.iter_matches(oldsid)); dfa.set_matches(anewsid, nnfa.iter_matches(oldsid)); } sparse_iter( nnfa, oldsid, &dfa.byte_classes, |byte, class, oldnextsid| { let class = usize::from(class); if oldnextsid == noncontiguous::NFA::FAIL { let oldnextsid = if state.fail() == noncontiguous::NFA::DEAD { noncontiguous::NFA::DEAD } else { nnfa.next_state( Anchored::No, 
state.fail(), byte, ) }; dfa.trans[unewsid.as_usize() + class] = oldnextsid; } else { dfa.trans[unewsid.as_usize() + class] = oldnextsid; dfa.trans[anewsid.as_usize() + class] = oldnextsid; } }, ); } } for i in 0..dfa.state_len { let sid = i << stride2; if is_anchored[i] { for next in dfa.trans[sid..][..stride].iter_mut() { *next = remap_anchored[*next]; } } else { for next in dfa.trans[sid..][..stride].iter_mut() { *next = remap_unanchored[*next]; } } } // Now that we've remapped all the IDs in our states, all that's left // is remapping the special state IDs. let old = nnfa.special(); let new = &mut dfa.special; new.max_special_id = remap_anchored[old.max_special_id]; new.max_match_id = remap_anchored[old.max_match_id]; new.start_unanchored_id = remap_unanchored[old.start_unanchored_id]; new.start_anchored_id = remap_anchored[old.start_anchored_id]; } /// Set the desired match semantics. /// /// This only applies when using [`Builder::build`] and not /// [`Builder::build_from_noncontiguous`]. /// /// See /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) /// for more documentation and examples. pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { self.noncontiguous.match_kind(kind); self } /// Enable ASCII-aware case insensitive matching. /// /// This only applies when using [`Builder::build`] and not /// [`Builder::build_from_noncontiguous`]. /// /// See /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) /// for more documentation and examples. pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { self.noncontiguous.ascii_case_insensitive(yes); self } /// Enable heuristic prefilter optimizations. /// /// This only applies when using [`Builder::build`] and not /// [`Builder::build_from_noncontiguous`]. /// /// See /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) /// for more documentation and examples. pub fn prefilter(&mut self, yes: bool) -> &mut Builder { self.noncontiguous.prefilter(yes); self } /// Sets the starting state configuration for the automaton. /// /// See /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) /// for more documentation and examples. pub fn start_kind(&mut self, kind: StartKind) -> &mut Builder { self.start_kind = kind; self } /// A debug setting for whether to attempt to shrink the size of the /// automaton's alphabet or not. /// /// This should never be enabled unless you're debugging an automaton. /// Namely, disabling byte classes makes transitions easier to reason /// about, since they use the actual bytes instead of equivalence classes. /// Disabling this confers no performance benefit at search time. /// /// See /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes) /// for more documentation and examples. pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { self.byte_classes = yes; self } } /// Iterate over all possible equivalence class transitions in this state. /// The closure is called for all transitions with a distinct equivalence /// class, even those not explicitly represented in this sparse state. For /// any implicitly defined transitions, the given closure is called with /// the fail state ID. /// /// The closure is guaranteed to be called precisely /// `byte_classes.alphabet_len()` times, once for every possible class in /// ascending order. 
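// A hedged example of this contract (the state and classes are made up):
// if a state has explicit transitions only on b'a' => S5 and b'b' => S6,
// and the equivalence classes are
// {0: [\x00-\x60], 1: [a], 2: [b], 3: [c-\xFF]}, then the closure is
// invoked exactly four times, in ascending class order:
//
//   f(0x00, 0, FAIL)  // no explicit transition for class 0
//   f(b'a', 1, S5)
//   f(b'b', 2, S6)
//   f(b'c', 3, FAIL)  // no explicit transition for class 3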
fn sparse_iter( nnfa: &noncontiguous::NFA, oldsid: StateID, classes: &ByteClasses, mut f: F, ) { let mut prev_class = None; let mut byte = 0usize; for t in nnfa.iter_trans(oldsid) { while byte < usize::from(t.byte()) { let rep = byte.as_u8(); let class = classes.get(rep); byte += 1; if prev_class != Some(class) { f(rep, class, noncontiguous::NFA::FAIL); prev_class = Some(class); } } let rep = t.byte(); let class = classes.get(rep); byte += 1; if prev_class != Some(class) { f(rep, class, t.next()); prev_class = Some(class); } } for b in byte..=255 { let rep = b.as_u8(); let class = classes.get(rep); if prev_class != Some(class) { f(rep, class, noncontiguous::NFA::FAIL); prev_class = Some(class); } } } aho-corasick-1.1.3/src/lib.rs000064400000000000000000000271741046102023000141070ustar 00000000000000/*! A library for finding occurrences of many patterns at once. This library provides multiple pattern search principally through an implementation of the [Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), which builds a fast finite state machine for executing searches in linear time. Additionally, this library provides a number of configuration options for building the automaton that permit controlling the space versus time trade off. Other features include simple ASCII case insensitive matching, finding overlapping matches, replacements, searching streams and even searching and replacing text in streams. Finally, unlike most other Aho-Corasick implementations, this one supports enabling [leftmost-first](MatchKind::LeftmostFirst) or [leftmost-longest](MatchKind::LeftmostLongest) match semantics, using a (seemingly) novel alternative construction algorithm. For more details on what match semantics means, see the [`MatchKind`] type. # Overview This section gives a brief overview of the primary types in this crate: * [`AhoCorasick`] is the primary type and represents an Aho-Corasick automaton. This is the type you use to execute searches. * [`AhoCorasickBuilder`] can be used to build an Aho-Corasick automaton, and supports configuring a number of options. * [`Match`] represents a single match reported by an Aho-Corasick automaton. Each match has two pieces of information: the pattern that matched and the start and end byte offsets corresponding to the position in the haystack at which it matched. # Example: basic searching This example shows how to search for occurrences of multiple patterns simultaneously. Each match includes the pattern that matched along with the byte offsets of the match. 
``` use aho_corasick::{AhoCorasick, PatternID}; let patterns = &["apple", "maple", "Snapple"]; let haystack = "Nobody likes maple in their apple flavored Snapple."; let ac = AhoCorasick::new(patterns).unwrap(); let mut matches = vec![]; for mat in ac.find_iter(haystack) { matches.push((mat.pattern(), mat.start(), mat.end())); } assert_eq!(matches, vec![ (PatternID::must(1), 13, 18), (PatternID::must(0), 28, 33), (PatternID::must(2), 43, 50), ]); ``` # Example: case insensitivity This is like the previous example, but matches `Snapple` case insensitively using `AhoCorasickBuilder`: ``` use aho_corasick::{AhoCorasick, PatternID}; let patterns = &["apple", "maple", "snapple"]; let haystack = "Nobody likes maple in their apple flavored Snapple."; let ac = AhoCorasick::builder() .ascii_case_insensitive(true) .build(patterns) .unwrap(); let mut matches = vec![]; for mat in ac.find_iter(haystack) { matches.push((mat.pattern(), mat.start(), mat.end())); } assert_eq!(matches, vec![ (PatternID::must(1), 13, 18), (PatternID::must(0), 28, 33), (PatternID::must(2), 43, 50), ]); ``` # Example: replacing matches in a stream This example shows how to execute a search and replace on a stream without loading the entire stream into memory first. ``` # #[cfg(feature = "std")] { use aho_corasick::AhoCorasick; # fn example() -> Result<(), std::io::Error> { let patterns = &["fox", "brown", "quick"]; let replace_with = &["sloth", "grey", "slow"]; // In a real example, these might be `std::fs::File`s instead. All you need to // do is supply a pair of `std::io::Read` and `std::io::Write` implementations. let rdr = "The quick brown fox."; let mut wtr = vec![]; let ac = AhoCorasick::new(patterns).unwrap(); ac.try_stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?; assert_eq!(b"The slow grey sloth.".to_vec(), wtr); # Ok(()) }; example().unwrap() # } ``` # Example: finding the leftmost first match In the textbook description of Aho-Corasick, its formulation is typically structured such that it reports all possible matches, even when they overlap with another. In many cases, overlapping matches may not be desired, such as the case of finding all successive non-overlapping matches like you might with a standard regular expression. Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do this doesn't always work in the expected way, since it will report matches as soon as they are seen. For example, consider matching the regex `Samwise|Sam` against the text `Samwise`. Most regex engines (that are Perl-like, or non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick algorithm modified for reporting non-overlapping matches will report `Sam`. A novel contribution of this library is the ability to change the match semantics of Aho-Corasick (without additional search time overhead) such that `Samwise` is reported instead. 
For example, here's the standard approach: ``` use aho_corasick::AhoCorasick; let patterns = &["Samwise", "Sam"]; let haystack = "Samwise"; let ac = AhoCorasick::new(patterns).unwrap(); let mat = ac.find(haystack).expect("should have a match"); assert_eq!("Sam", &haystack[mat.start()..mat.end()]); ``` And now here's the leftmost-first version, which matches how a Perl-like regex will work: ``` use aho_corasick::{AhoCorasick, MatchKind}; let patterns = &["Samwise", "Sam"]; let haystack = "Samwise"; let ac = AhoCorasick::builder() .match_kind(MatchKind::LeftmostFirst) .build(patterns) .unwrap(); let mat = ac.find(haystack).expect("should have a match"); assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); ``` In addition to leftmost-first semantics, this library also supports leftmost-longest semantics, which match the POSIX behavior of a regular expression alternation. See [`MatchKind`] for more details. # Prefilters While an Aho-Corasick automaton can perform admirably when compared to more naive solutions, it is generally slower than more specialized algorithms that are accelerated using vector instructions such as SIMD. For that reason, this library will internally use a "prefilter" to attempt to accelerate searches when possible. Currently, this library has several different algorithms it might use depending on the patterns provided. Once the number of patterns gets too big, prefilters are no longer used. While a prefilter is generally good to have on by default since it works well in the common case, it can lead to less predictable or even sub-optimal performance in some cases. For that reason, prefilters can be explicitly disabled via [`AhoCorasickBuilder::prefilter`]. # Lower level APIs This crate also provides several sub-modules that collectively expose many of the implementation details of the main [`AhoCorasick`] type. Most users of this library can completely ignore the submodules and their contents, but if you needed finer grained control, some parts of them may be useful to you. Here is a brief overview of each and why you might want to use them: * The [`packed`] sub-module contains a lower level API for using fast vectorized routines for finding a small number of patterns in a haystack. You might want to use this API when you want to completely side-step using Aho-Corasick automata. Otherwise, the fast vectorized routines are used automatically as prefilters for `AhoCorasick` searches whenever possible. * The [`automaton`] sub-module provides a lower level finite state machine interface that the various Aho-Corasick implementations in this crate implement. This sub-module's main contribution is the [`Automaton`](automaton::Automaton) trait, which permits manually walking the state transitions of an Aho-Corasick automaton. * The [`dfa`] and [`nfa`] sub-modules provide DFA and NFA implementations of the aforementioned `Automaton` trait. The main reason one might want to use these sub-modules is to get access to a type that implements the `Automaton` trait. (The top-level `AhoCorasick` type does not implement the `Automaton` trait.) As mentioned above, if you aren't sure whether you need these sub-modules, you should be able to safely ignore them and just focus on the [`AhoCorasick`] type. # Crate features This crate exposes a few features for controlling dependency usage and whether this crate can be used without the standard library. * **std** - Enables support for the standard library. This feature is enabled by default. When disabled, only `core` and `alloc` are used. 
At an API level, enabling `std` enables `std::error::Error` trait impls for the various error types, and higher level stream search routines such as [`AhoCorasick::try_stream_find_iter`]. But the `std` feature is also required to enable vectorized prefilters. Prefilters can greatly accelerate searches, but generally only apply when the number of patterns is small (less than ~100). * **perf-literal** - Enables support for literal prefilters that use vectorized routines from external crates. This feature is enabled by default. If you're only using Aho-Corasick for large numbers of patterns or otherwise can abide lower throughput when searching with a small number of patterns, then it is reasonable to disable this feature. * **logging** - Enables a dependency on the `log` crate and emits messages to aide in diagnostics. This feature is disabled by default. */ #![no_std] #![deny(missing_docs)] #![deny(rustdoc::broken_intra_doc_links)] #![cfg_attr(docsrs, feature(doc_auto_cfg))] extern crate alloc; #[cfg(any(test, feature = "std"))] extern crate std; #[cfg(doctest)] doc_comment::doctest!("../README.md"); #[cfg(feature = "std")] pub use crate::ahocorasick::StreamFindIter; pub use crate::{ ahocorasick::{ AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, FindIter, FindOverlappingIter, }, util::{ error::{BuildError, MatchError, MatchErrorKind}, primitives::{PatternID, PatternIDError}, search::{Anchored, Input, Match, MatchKind, Span, StartKind}, }, }; #[macro_use] mod macros; mod ahocorasick; pub mod automaton; pub mod dfa; pub mod nfa; pub mod packed; #[cfg(test)] mod tests; // I wrote out the module for implementing fst::Automaton only to later realize // that this would make fst a public dependency and fst is not at 1.0 yet. I // decided to just keep the code in tree, but build it only during tests. // // TODO: I think I've changed my mind again. I'm considering pushing it out // into either a separate crate or into 'fst' directly as an optional feature. // #[cfg(test)] // #[allow(dead_code)] // mod transducer; pub(crate) mod util; #[cfg(test)] mod testoibits { use std::panic::{RefUnwindSafe, UnwindSafe}; use super::*; fn assert_all() {} #[test] fn oibits_main() { assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); } #[test] fn oibits_automaton() { use crate::{automaton, dfa::DFA}; assert_all::>(); assert_all::>(); #[cfg(feature = "std")] assert_all::>(); assert_all::(); assert_all::(); assert_all::(); } #[test] fn oibits_packed() { use crate::packed; assert_all::(); assert_all::(); assert_all::(); assert_all::(); assert_all::(); } } aho-corasick-1.1.3/src/macros.rs000064400000000000000000000004551046102023000146160ustar 00000000000000#![allow(unused_macros)] macro_rules! log { ($($tt:tt)*) => { #[cfg(feature = "logging")] { $($tt)* } } } macro_rules! debug { ($($tt:tt)*) => { log!(log::debug!($($tt)*)) } } macro_rules! trace { ($($tt:tt)*) => { log!(log::trace!($($tt)*)) } } aho-corasick-1.1.3/src/nfa/contiguous.rs000064400000000000000000001307021046102023000162740ustar 00000000000000/*! Provides a contiguous NFA implementation of Aho-Corasick. This is a low-level API that generally only needs to be used in niche circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of a contiguous NFA directly. 
Using an `NFA` directly is typically only necessary when one needs access to the [`Automaton`] trait implementation. */ use alloc::{vec, vec::Vec}; use crate::{ automaton::Automaton, nfa::noncontiguous, util::{ alphabet::ByteClasses, error::{BuildError, MatchError}, int::{Usize, U16, U32}, prefilter::Prefilter, primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, search::{Anchored, MatchKind}, special::Special, }, }; /// A contiguous NFA implementation of Aho-Corasick. /// /// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of /// this type directly. Using an `NFA` directly is typically only necessary /// when one needs access to the [`Automaton`] trait implementation. /// /// This NFA can only be built by first constructing a [`noncontiguous::NFA`]. /// Both [`NFA::new`] and [`Builder::build`] do this for you automatically, but /// [`Builder::build_from_noncontiguous`] permits doing it explicitly. /// /// The main difference between a noncontiguous NFA and a contiguous NFA is /// that the latter represents all of its states and transitions in a single /// allocation, where as the former uses a separate allocation for each state. /// Doing this at construction time while keeping a low memory footprint isn't /// feasible, which is primarily why there are two different NFA types: one /// that does the least amount of work possible to build itself, and another /// that does a little extra work to compact itself and make state transitions /// faster by making some states use a dense representation. /// /// Because a contiguous NFA uses a single allocation, there is a lot more /// opportunity for compression tricks to reduce the heap memory used. Indeed, /// it is not uncommon for a contiguous NFA to use an order of magnitude less /// heap memory than a noncontiguous NFA. Since building a contiguous NFA /// usually only takes a fraction of the time it takes to build a noncontiguous /// NFA, the overall build time is not much slower. Thus, in most cases, a /// contiguous NFA is the best choice. /// /// Since a contiguous NFA uses various tricks for compression and to achieve /// faster state transitions, currently, its limit on the number of states /// is somewhat smaller than what a noncontiguous NFA can achieve. Generally /// speaking, you shouldn't expect to run into this limit if the number of /// patterns is under 1 million. It is plausible that this limit will be /// increased in the future. If the limit is reached, building a contiguous NFA /// will return an error. Often, since building a contiguous NFA is relatively /// cheap, it can make sense to always try it even if you aren't sure if it /// will fail or not. If it does, you can always fall back to a noncontiguous /// NFA. (Indeed, the main [`AhoCorasick`](crate::AhoCorasick) type employs a /// strategy similar to this at construction time.) /// /// # Example /// /// This example shows how to build an `NFA` directly and use it to execute /// [`Automaton::try_find`]: /// /// ``` /// use aho_corasick::{ /// automaton::Automaton, /// nfa::contiguous::NFA, /// Input, Match, /// }; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let nfa = NFA::new(patterns).unwrap(); /// assert_eq!( /// Some(Match::must(0, 1..2)), /// nfa.try_find(&Input::new(haystack))?, /// ); /// # Ok::<(), Box>(()) /// ``` /// /// It is also possible to implement your own version of `try_find`. See the /// [`Automaton`] documentation for an example. 
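// A sketch of the fallback strategy suggested above (the error handling
// is illustrative and 'patterns' is a hypothetical stand-in; both NFA
// types are built from the same noncontiguous NFA, so nothing is parsed
// twice):
//
//   let nnfa = noncontiguous::NFA::new(patterns).unwrap();
//   match Builder::new().build_from_noncontiguous(&nnfa) {
//       Ok(cnfa) => { /* search with the contiguous NFA */ }
//       Err(_) => { /* fall back to searching with 'nnfa' */ }
//   }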
#[derive(Clone)] pub struct NFA { /// The raw NFA representation. Each state is packed with a header /// (containing the format of the state, the failure transition and, for /// a sparse state, the number of transitions), its transitions and any /// matching pattern IDs for match states. repr: Vec<u32>, /// The length of each pattern. This is used to compute the start offset /// of a match. pattern_lens: Vec<SmallIndex>, /// The total number of states in this NFA. state_len: usize, /// A prefilter for accelerating searches, if one exists. prefilter: Option<Prefilter>, /// The match semantics built into this NFA. match_kind: MatchKind, /// The alphabet size, or total number of equivalence classes, for this /// NFA. Dense states always have this many transitions. alphabet_len: usize, /// The equivalence classes for this NFA. All transitions, dense and /// sparse, are defined on equivalence classes and not on the 256 distinct /// byte values. byte_classes: ByteClasses, /// The length of the shortest pattern in this automaton. min_pattern_len: usize, /// The length of the longest pattern in this automaton. max_pattern_len: usize, /// The information required to deduce which states are "special" in this /// NFA. special: Special, } impl NFA { /// Create a new Aho-Corasick contiguous NFA using the default /// configuration. /// /// Use a [`Builder`] if you want to change the configuration. pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError> where I: IntoIterator<Item = P>, P: AsRef<[u8]>, { NFA::builder().build(patterns) } /// A convenience method for returning a new Aho-Corasick contiguous NFA /// builder. /// /// This usually permits one to just import the `NFA` type. pub fn builder() -> Builder { Builder::new() } } impl NFA { /// A sentinel state ID indicating that a search should stop once it has /// entered this state. When a search stops, it returns a match if one /// has been found, otherwise no match. A contiguous NFA always has an /// actual dead state at this ID. const DEAD: StateID = StateID::new_unchecked(0); /// Another sentinel state ID indicating that a search should move through /// the current state's failure transition. /// /// Note that unlike DEAD, this does not actually point to a valid state /// in a contiguous NFA. (noncontiguous::NFA::FAIL does point to a valid /// state.) Instead, this points to the position that is guaranteed to /// never be a valid state ID (by making sure it points to a place in the /// middle of the encoding of the DEAD state). Since we never need to /// actually look at the FAIL state itself, this works out. /// /// But why do it this way? So that FAIL is a constant. I don't have any /// concrete evidence that this materially helps matters, but it's easy to /// do. The alternative would be making the FAIL ID point to the second /// state, which could be made a constant but is a little trickier to do. /// The easiest path is to just make the FAIL state a runtime value, but /// since comparisons with FAIL occur in perf critical parts of the search, /// we want it to be as tight as possible and not waste any registers. /// /// Very hand wavy... But the code complexity that results from this is /// very mild. const FAIL: StateID = StateID::new_unchecked(1); } // SAFETY: 'start_state' always returns a valid state ID, 'next_state' always // returns a valid state ID given a valid state ID. We otherwise claim that // all other methods are correct as well.
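// Before the Automaton impl below, a sketch of the packed 'repr' layout
// (inferred from the field docs above and the transition code that
// follows; it is a reading aid, not a normative spec). For a dense state
// at offset 'o':
//
//   repr[o]        header (low byte encodes the state's format)
//   repr[o + 1]    failure transition (a StateID stored as a u32)
//   repr[o + 2..][..alphabet_len]   one next-state u32 per class
//
// A sparse state instead stores its transition count in the header, then
// its equivalence classes packed four per u32, then one u32 next state
// per transition.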
unsafe impl Automaton for NFA {
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        match anchored {
            Anchored::No => Ok(self.special.start_unanchored_id),
            Anchored::Yes => Ok(self.special.start_anchored_id),
        }
    }

    #[inline(always)]
    fn next_state(
        &self,
        anchored: Anchored,
        mut sid: StateID,
        byte: u8,
    ) -> StateID {
        let repr = &self.repr;
        let class = self.byte_classes.get(byte);
        let u32tosid = StateID::from_u32_unchecked;
        loop {
            let o = sid.as_usize();
            let kind = repr[o] & 0xFF;
            // I tried to encapsulate the "next transition" logic into its
            // own function, but it seemed to always result in sub-optimal
            // codegen that led to real and significant slowdowns. So we just
            // inline the logic here.
            //
            // I've also tried a lot of different ways to speed up this
            // routine, and most of them have failed.
            if kind == State::KIND_DENSE {
                let next = u32tosid(repr[o + 2 + usize::from(class)]);
                if next != NFA::FAIL {
                    return next;
                }
            } else if kind == State::KIND_ONE {
                if class == repr[o].low_u16().high_u8() {
                    return u32tosid(repr[o + 2]);
                }
            } else {
                // NOTE: I tried a SWAR technique in the loop below, but
                // found it slower. See the 'swar' test in the tests for this
                // module.
                let trans_len = kind.as_usize();
                let classes_len = u32_len(trans_len);
                let trans_offset = o + 2 + classes_len;
                for (i, &chunk) in
                    repr[o + 2..][..classes_len].iter().enumerate()
                {
                    let classes = chunk.to_ne_bytes();
                    if classes[0] == class {
                        return u32tosid(repr[trans_offset + i * 4]);
                    }
                    if classes[1] == class {
                        return u32tosid(repr[trans_offset + i * 4 + 1]);
                    }
                    if classes[2] == class {
                        return u32tosid(repr[trans_offset + i * 4 + 2]);
                    }
                    if classes[3] == class {
                        return u32tosid(repr[trans_offset + i * 4 + 3]);
                    }
                }
            }
            // For an anchored search, we never follow failure transitions
            // because failure transitions lead us down a path to matching
            // a *proper* suffix of the path we were on. Thus, it can only
            // produce matches that appear after the beginning of the search.
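            //
            // (For example, given the patterns "abcd" and "bcx", an anchored
            // search of "abcx" that followed the failure transition out of
            // "abc" into "bc" could go on to find "bcx" at offset 1, which
            // does not start where the search began.)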
            if anchored.is_anchored() {
                return NFA::DEAD;
            }
            sid = u32tosid(repr[o + 1]);
        }
    }

    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        sid <= self.special.max_special_id
    }

    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        sid == NFA::DEAD
    }

    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        !self.is_dead(sid) && sid <= self.special.max_match_id
    }

    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        sid == self.special.start_unanchored_id
            || sid == self.special.start_anchored_id
    }

    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        self.match_kind
    }

    #[inline(always)]
    fn patterns_len(&self) -> usize {
        self.pattern_lens.len()
    }

    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        self.pattern_lens[pid].as_usize()
    }

    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        self.min_pattern_len
    }

    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        self.max_pattern_len
    }

    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        State::match_len(self.alphabet_len, &self.repr[sid.as_usize()..])
    }

    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        State::match_pattern(
            self.alphabet_len,
            &self.repr[sid.as_usize()..],
            index,
        )
    }

    #[inline(always)]
    fn memory_usage(&self) -> usize {
        use core::mem::size_of;

        (self.repr.len() * size_of::<u32>())
            + (self.pattern_lens.len() * size_of::<SmallIndex>())
            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
    }

    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        self.prefilter.as_ref()
    }
}

impl core::fmt::Debug for NFA {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        use crate::automaton::fmt_state_indicator;

        writeln!(f, "contiguous::NFA(")?;
        let mut sid = NFA::DEAD; // always the first state and always present
        loop {
            let raw = &self.repr[sid.as_usize()..];
            if raw.is_empty() {
                break;
            }
            let is_match = self.is_match(sid);
            let state = State::read(self.alphabet_len, is_match, raw);
            fmt_state_indicator(f, self, sid)?;
            write!(
                f,
                "{:06}({:06}): ",
                sid.as_usize(),
                state.fail.as_usize()
            )?;
            state.fmt(f)?;
            write!(f, "\n")?;
            if self.is_match(sid) {
                write!(f, " matches: ")?;
                for i in 0..state.match_len {
                    let pid = State::match_pattern(self.alphabet_len, raw, i);
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    write!(f, "{}", pid.as_usize())?;
                }
                write!(f, "\n")?;
            }
            // The FAIL state doesn't actually have space for a state
            // allocated for it, so we have to treat it as a special case.
            // We write it just below the DEAD state.
            if sid == NFA::DEAD {
                writeln!(f, "F {:06}:", NFA::FAIL.as_usize())?;
            }
            let len = State::len(self.alphabet_len, is_match, raw);
            sid = StateID::new(sid.as_usize().checked_add(len).unwrap())
                .unwrap();
        }
        writeln!(f, "match kind: {:?}", self.match_kind)?;
        writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?;
        writeln!(f, "state length: {:?}", self.state_len)?;
        writeln!(f, "pattern length: {:?}", self.patterns_len())?;
        writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?;
        writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?;
        writeln!(f, "alphabet length: {:?}", self.alphabet_len)?;
        writeln!(f, "byte classes: {:?}", self.byte_classes)?;
        writeln!(f, "memory usage: {:?}", self.memory_usage())?;
        writeln!(f, ")")?;
        Ok(())
    }
}

/// The "in memory" representation of a single dense or sparse state.
///
/// A `State`'s in memory representation is not ever actually materialized
/// during a search with a contiguous NFA. Doing so would be too slow.
/// (Indeed, the only time a `State` is actually constructed is in `Debug`
/// impls.)
/// Instead, a `State` exposes a number of static methods for reading certain
/// things from the raw binary encoding of the state.
#[derive(Clone)]
struct State<'a> {
    /// The state to transition to when 'class_to_next' yields a transition
    /// to the FAIL state.
    fail: StateID,
    /// The number of pattern IDs in this state. For a non-match state, this
    /// is always zero. Otherwise it is always bigger than zero.
    match_len: usize,
    /// The sparse or dense representation of the transitions for this state.
    trans: StateTrans<'a>,
}

/// The underlying representation of sparse or dense transitions for a state.
///
/// Note that like `State`, we don't typically construct values of this type
/// during a search, since we don't always need all values and doing so would
/// represent a lot of wasteful work.
#[derive(Clone)]
enum StateTrans<'a> {
    /// A sparse representation of transitions for a state, where only
    /// non-FAIL transitions are explicitly represented.
    Sparse {
        classes: &'a [u32],
        /// The transitions for this state, where each transition is packed
        /// into a u32. The low 8 bits correspond to the byte class for the
        /// transition, and the high 24 bits correspond to the next state ID.
        ///
        /// This packing is why the max state ID allowed for a contiguous
        /// NFA is 2^24-1.
        nexts: &'a [u32],
    },
    /// A "one transition" state that is never a match state.
    ///
    /// These are by far the most common state, so we use a specialized and
    /// very compact representation for them.
    One {
        /// The element of this NFA's alphabet that this transition is
        /// defined for.
        class: u8,
        /// The state this should transition to if the current symbol is
        /// equal to 'class'.
        next: u32,
    },
    /// A dense representation of transitions for a state, where all
    /// transitions are explicitly represented, including transitions to the
    /// FAIL state.
    Dense {
        /// A dense set of transitions to other states. The transitions may
        /// point to a FAIL state, in which case, the search should try the
        /// same transition lookup at 'fail'.
        ///
        /// Note that this is indexed by byte equivalence classes and not
        /// byte values. That means 'class_to_next[byte]' is wrong and
        /// 'class_to_next[classes.get(byte)]' is correct. The number of
        /// transitions is always equivalent to 'classes.alphabet_len()'.
        class_to_next: &'a [u32],
    },
}

impl<'a> State<'a> {
    /// The offset of where the "kind" of a state is stored. If it isn't one
    /// of the sentinel values below, then it's a sparse state and the kind
    /// corresponds to the number of transitions in the state.
    const KIND: usize = 0;

    /// A sentinel value indicating that the state uses a dense
    /// representation.
    const KIND_DENSE: u32 = 0xFF;
    /// A sentinel value indicating that the state uses a special "one
    /// transition" encoding. In practice, non-match states with one
    /// transition make up the overwhelming majority of all states in any
    /// given Aho-Corasick automaton, so we can specialize them using a very
    /// compact representation.
    const KIND_ONE: u32 = 0xFE;

    /// The maximum number of transitions to encode as a sparse state.
    /// Usually states with a lot of transitions are either very rare, or
    /// occur near the start state. In the latter case, they are probably
    /// dense already anyway. In the former case, making them dense is fine
    /// because they're rare.
    ///
    /// This needs to be small enough to permit each of the sentinel values
    /// for 'KIND' above. Namely, a sparse state embeds the number of
    /// transitions into the 'KIND'. Basically, "sparse" is a state kind
    /// too, but it's the "else" branch.
    ///
    /// N.B. There isn't anything particularly magical about 127 here. I
    /// just picked it because I figured any sparse state with this many
    /// transitions is going to be exceptionally rare, and if it did have
    /// this many transitions, then it would be quite slow to do a linear
    /// scan on the transitions during a search anyway.
    const MAX_SPARSE_TRANSITIONS: usize = 127;

    /// Remap state IDs in-place.
    ///
    /// `state` should be the raw binary encoding of a state. (The start
    /// of the slice must correspond to the start of the state, but the
    /// slice may extend past the end of the encoding of the state.)
    fn remap(
        alphabet_len: usize,
        old_to_new: &[StateID],
        state: &mut [u32],
    ) -> Result<(), BuildError> {
        let kind = State::kind(state);
        if kind == State::KIND_DENSE {
            state[1] = old_to_new[state[1].as_usize()].as_u32();
            for next in state[2..][..alphabet_len].iter_mut() {
                *next = old_to_new[next.as_usize()].as_u32();
            }
        } else if kind == State::KIND_ONE {
            state[1] = old_to_new[state[1].as_usize()].as_u32();
            state[2] = old_to_new[state[2].as_usize()].as_u32();
        } else {
            let trans_len = State::sparse_trans_len(state);
            let classes_len = u32_len(trans_len);
            state[1] = old_to_new[state[1].as_usize()].as_u32();
            for next in state[2 + classes_len..][..trans_len].iter_mut() {
                *next = old_to_new[next.as_usize()].as_u32();
            }
        }
        Ok(())
    }

    /// Returns the length, in number of u32s, of this state.
    ///
    /// This is useful for reading states consecutively, e.g., in the Debug
    /// impl without needing to store a separate map from state index to
    /// state identifier.
    ///
    /// `state` should be the raw binary encoding of a state. (The start
    /// of the slice must correspond to the start of the state, but the
    /// slice may extend past the end of the encoding of the state.)
    fn len(alphabet_len: usize, is_match: bool, state: &[u32]) -> usize {
        let kind_len = 1;
        let fail_len = 1;
        let kind = State::kind(state);
        let (classes_len, trans_len) = if kind == State::KIND_DENSE {
            (0, alphabet_len)
        } else if kind == State::KIND_ONE {
            (0, 1)
        } else {
            let trans_len = State::sparse_trans_len(state);
            let classes_len = u32_len(trans_len);
            (classes_len, trans_len)
        };
        let match_len = if !is_match {
            0
        } else if State::match_len(alphabet_len, state) == 1 {
            // This is a special case because when there is one pattern ID
            // for a match state, it is represented by a single u32 with its
            // high bit set (which is impossible for a valid pattern ID).
            1
        } else {
            // We add 1 to include the u32 that indicates the number of
            // pattern IDs that follow.
            1 + State::match_len(alphabet_len, state)
        };
        kind_len + fail_len + classes_len + trans_len + match_len
    }

    /// Returns the kind of this state.
    ///
    /// This only includes the low byte.
    #[inline(always)]
    fn kind(state: &[u32]) -> u32 {
        state[State::KIND] & 0xFF
    }

    /// Get the number of sparse transitions in this state. This can never
    /// be more than State::MAX_SPARSE_TRANSITIONS, as all states with more
    /// transitions are encoded as dense states.
    ///
    /// `state` should be the raw binary encoding of a sparse state. (The
    /// start of the slice must correspond to the start of the state, but
    /// the slice may extend past the end of the encoding of the state.) If
    /// this isn't a sparse state, then the return value is unspecified.
    ///
    /// Do note that this is only legal to call on a sparse state. So for
    /// example, a "one transition" state is not a sparse state, so it would
    /// not be legal to call this method on such a state.
    #[inline(always)]
    fn sparse_trans_len(state: &[u32]) -> usize {
        (state[State::KIND] & 0xFF).as_usize()
    }

    /// Returns the total number of matching pattern IDs in this state.
    /// Calling this on a state that isn't a match results in unspecified
    /// behavior. Thus, for all correct calls, the returned number is never
    /// 0.
    ///
    /// `state` should be the raw binary encoding of a state. (The start
    /// of the slice must correspond to the start of the state, but the
    /// slice may extend past the end of the encoding of the state.)
    #[inline(always)]
    fn match_len(alphabet_len: usize, state: &[u32]) -> usize {
        // We don't need to handle KIND_ONE here because it can never be a
        // match state.
        let packed = if State::kind(state) == State::KIND_DENSE {
            let start = 2 + alphabet_len;
            state[start].as_usize()
        } else {
            let trans_len = State::sparse_trans_len(state);
            let classes_len = u32_len(trans_len);
            let start = 2 + classes_len + trans_len;
            state[start].as_usize()
        };
        if packed & (1 << 31) == 0 {
            packed
        } else {
            1
        }
    }

    /// Returns the pattern ID corresponding to the given index for the state
    /// given. The `index` provided must be less than the number of pattern
    /// IDs in this state.
    ///
    /// `state` should be the raw binary encoding of a state. (The start of
    /// the slice must correspond to the start of the state, but the slice
    /// may extend past the end of the encoding of the state.)
    ///
    /// If the given state is not a match state or if the index is out of
    /// bounds, then this has unspecified behavior.
    #[inline(always)]
    fn match_pattern(
        alphabet_len: usize,
        state: &[u32],
        index: usize,
    ) -> PatternID {
        // We don't need to handle KIND_ONE here because it can never be a
        // match state.
        let start = if State::kind(state) == State::KIND_DENSE {
            2 + alphabet_len
        } else {
            let trans_len = State::sparse_trans_len(state);
            let classes_len = u32_len(trans_len);
            2 + classes_len + trans_len
        };
        let packed = state[start];
        let pid = if packed & (1 << 31) == 0 {
            state[start + 1 + index]
        } else {
            assert_eq!(0, index);
            packed & !(1 << 31)
        };
        PatternID::from_u32_unchecked(pid)
    }

    /// Read a state's binary encoding to its in-memory representation.
    ///
    /// `alphabet_len` should be the total number of transitions defined for
    /// dense states.
    ///
    /// `is_match` should be true if this state is a match state and false
    /// otherwise.
    ///
    /// `state` should be the raw binary encoding of a state. (The start
    /// of the slice must correspond to the start of the state, but the
    /// slice may extend past the end of the encoding of the state.)
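    ///
    /// The encoded layout read here, mirroring `State::len` above, is: one
    /// u32 for the kind, one u32 for the failure transition, then (for
    /// sparse states) the packed byte classes, then the transitions, and
    /// finally any match information.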
    fn read(
        alphabet_len: usize,
        is_match: bool,
        state: &'a [u32],
    ) -> State<'a> {
        let kind = State::kind(state);
        let match_len = if !is_match {
            0
        } else {
            State::match_len(alphabet_len, state)
        };
        let (trans, fail) = if kind == State::KIND_DENSE {
            let fail = StateID::from_u32_unchecked(state[1]);
            let class_to_next = &state[2..][..alphabet_len];
            (StateTrans::Dense { class_to_next }, fail)
        } else if kind == State::KIND_ONE {
            let fail = StateID::from_u32_unchecked(state[1]);
            let class = state[State::KIND].low_u16().high_u8();
            let next = state[2];
            (StateTrans::One { class, next }, fail)
        } else {
            let fail = StateID::from_u32_unchecked(state[1]);
            let trans_len = State::sparse_trans_len(state);
            let classes_len = u32_len(trans_len);
            let classes = &state[2..][..classes_len];
            let nexts = &state[2 + classes_len..][..trans_len];
            (StateTrans::Sparse { classes, nexts }, fail)
        };
        State { fail, match_len, trans }
    }

    /// Encode the "old" state from a noncontiguous NFA into its binary
    /// representation in the given `dst` slice. `classes` should be the
    /// byte classes computed for the noncontiguous NFA that the given state
    /// came from.
    ///
    /// This returns an error if `dst` became so big that `StateID`s can no
    /// longer be created for new states. Otherwise, it returns the state ID
    /// of the new state created.
    ///
    /// When `force_dense` is true, then the encoded state will always use a
    /// dense format. Otherwise, the choice between dense and sparse will be
    /// automatically chosen based on the old state.
    fn write(
        nnfa: &noncontiguous::NFA,
        oldsid: StateID,
        old: &noncontiguous::State,
        classes: &ByteClasses,
        dst: &mut Vec<u32>,
        force_dense: bool,
    ) -> Result<StateID, BuildError> {
        let sid = StateID::new(dst.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        let old_len = nnfa.iter_trans(oldsid).count();
        // For states with a lot of transitions, we might as well just make
        // them dense. These kinds of hot states tend to be very rare, so
        // we're okay with it. This also gives us more sentinels in the
        // state's 'kind', which lets us create different state kinds to
        // save on space.
        let kind = if force_dense || old_len > State::MAX_SPARSE_TRANSITIONS {
            State::KIND_DENSE
        } else if old_len == 1 && !old.is_match() {
            State::KIND_ONE
        } else {
            // For a sparse state, the kind is just the number of
            // transitions.
            u32::try_from(old_len).unwrap()
        };
        if kind == State::KIND_DENSE {
            dst.push(kind);
            dst.push(old.fail().as_u32());
            State::write_dense_trans(nnfa, oldsid, classes, dst)?;
        } else if kind == State::KIND_ONE {
            let t = nnfa.iter_trans(oldsid).next().unwrap();
            let class = u32::from(classes.get(t.byte()));
            dst.push(kind | (class << 8));
            dst.push(old.fail().as_u32());
            dst.push(t.next().as_u32());
        } else {
            dst.push(kind);
            dst.push(old.fail().as_u32());
            State::write_sparse_trans(nnfa, oldsid, classes, dst)?;
        }
        // Now finally write the number of matches and the matches
        // themselves.
        if old.is_match() {
            let matches_len = nnfa.iter_matches(oldsid).count();
            if matches_len == 1 {
                let pid = nnfa.iter_matches(oldsid).next().unwrap().as_u32();
                assert_eq!(0, pid & (1 << 31));
                dst.push((1 << 31) | pid);
            } else {
                assert_eq!(0, matches_len & (1 << 31));
                dst.push(matches_len.as_u32());
                dst.extend(nnfa.iter_matches(oldsid).map(|pid| pid.as_u32()));
            }
        }
        Ok(sid)
    }

    /// Encode the "old" state transitions from a noncontiguous NFA into
    /// their binary sparse representation in the given `dst` slice.
    /// `classes` should be the byte classes computed for the noncontiguous
    /// NFA that the given state came from.
    ///
    /// This returns an error if `dst` became so big that `StateID`s can no
    /// longer be created for new states.
    fn write_sparse_trans(
        nnfa: &noncontiguous::NFA,
        oldsid: StateID,
        classes: &ByteClasses,
        dst: &mut Vec<u32>,
    ) -> Result<(), BuildError> {
        let (mut chunk, mut len) = ([0; 4], 0);
        for t in nnfa.iter_trans(oldsid) {
            chunk[len] = classes.get(t.byte());
            len += 1;
            if len == 4 {
                dst.push(u32::from_ne_bytes(chunk));
                chunk = [0; 4];
                len = 0;
            }
        }
        if len > 0 {
            // In the case where the number of transitions isn't divisible
            // by 4, the last u32 chunk will have some left over room. In
            // this case, we "just" repeat the last equivalence class. By
            // doing this, we know the leftover faux transitions will never
            // be followed, because the genuine transition for that repeated
            // equivalence class appears earlier in the chunk and would be
            // followed first. This saves us some branching in the search
            // time state transition code.
            let repeat = chunk[len - 1];
            while len < 4 {
                chunk[len] = repeat;
                len += 1;
            }
            dst.push(u32::from_ne_bytes(chunk));
        }
        for t in nnfa.iter_trans(oldsid) {
            dst.push(t.next().as_u32());
        }
        Ok(())
    }

    /// Encode the "old" state transitions from a noncontiguous NFA into
    /// their binary dense representation in the given `dst` slice.
    /// `classes` should be the byte classes computed for the noncontiguous
    /// NFA that the given state came from.
    ///
    /// This returns an error if `dst` became so big that `StateID`s can no
    /// longer be created for new states.
    fn write_dense_trans(
        nnfa: &noncontiguous::NFA,
        oldsid: StateID,
        classes: &ByteClasses,
        dst: &mut Vec<u32>,
    ) -> Result<(), BuildError> {
        // Our byte classes let us shrink the size of our dense states to
        // the number of equivalence classes instead of just fixing it to
        // 256. Any non-explicitly defined transition is just a transition
        // to the FAIL state, so we fill that in first and then overwrite
        // them with explicitly defined transitions. (Most states probably
        // only have one or two explicitly defined transitions.)
        //
        // N.B. Remember that while building the contiguous NFA, we use
        // state IDs from the noncontiguous NFA. It isn't until we've added
        // all states that we go back and map noncontiguous IDs to
        // contiguous IDs.
        let start = dst.len();
        dst.extend(
            core::iter::repeat(noncontiguous::NFA::FAIL.as_u32())
                .take(classes.alphabet_len()),
        );
        assert!(start < dst.len(), "equivalence classes are never empty");
        for t in nnfa.iter_trans(oldsid) {
            dst[start + usize::from(classes.get(t.byte()))] =
                t.next().as_u32();
        }
        Ok(())
    }

    /// Return an iterator over every explicitly defined transition in this
    /// state.
    fn transitions<'b>(&'b self) -> impl Iterator<Item = (u8, StateID)> + 'b {
        let mut i = 0;
        core::iter::from_fn(move || match self.trans {
            StateTrans::Sparse { classes, nexts } => {
                if i >= nexts.len() {
                    return None;
                }
                let chunk = classes[i / 4];
                let class = chunk.to_ne_bytes()[i % 4];
                let next = StateID::from_u32_unchecked(nexts[i]);
                i += 1;
                Some((class, next))
            }
            StateTrans::One { class, next } => {
                if i == 0 {
                    i += 1;
                    Some((class, StateID::from_u32_unchecked(next)))
                } else {
                    None
                }
            }
            StateTrans::Dense { class_to_next } => {
                if i >= class_to_next.len() {
                    return None;
                }
                let class = i.as_u8();
                let next = StateID::from_u32_unchecked(class_to_next[i]);
                i += 1;
                Some((class, next))
            }
        })
    }
}

impl<'a> core::fmt::Debug for State<'a> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        use crate::{automaton::sparse_transitions, util::debug::DebugByte};

        let it = sparse_transitions(self.transitions())
            // Writing out all FAIL transitions is quite noisy.
            // Instead, we just require readers of the output to assume
            // anything absent maps to the FAIL transition.
            .filter(|&(_, _, sid)| sid != NFA::FAIL)
            .enumerate();
        for (i, (start, end, sid)) in it {
            if i > 0 {
                write!(f, ", ")?;
            }
            if start == end {
                write!(f, "{:?} => {:?}", DebugByte(start), sid.as_usize())?;
            } else {
                write!(
                    f,
                    "{:?}-{:?} => {:?}",
                    DebugByte(start),
                    DebugByte(end),
                    sid.as_usize()
                )?;
            }
        }
        Ok(())
    }
}

/// A builder for configuring an Aho-Corasick contiguous NFA.
///
/// This builder has a subset of the options available to an
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
#[derive(Clone, Debug)]
pub struct Builder {
    noncontiguous: noncontiguous::Builder,
    dense_depth: usize,
    byte_classes: bool,
}

impl Default for Builder {
    fn default() -> Builder {
        Builder {
            noncontiguous: noncontiguous::Builder::new(),
            dense_depth: 2,
            byte_classes: true,
        }
    }
}

impl Builder {
    /// Create a new builder for configuring an Aho-Corasick contiguous NFA.
    pub fn new() -> Builder {
        Builder::default()
    }

    /// Build an Aho-Corasick contiguous NFA from the given iterator of
    /// patterns.
    ///
    /// A builder may be reused to create more NFAs.
    pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        let nnfa = self.noncontiguous.build(patterns)?;
        self.build_from_noncontiguous(&nnfa)
    }

    /// Build an Aho-Corasick contiguous NFA from the given noncontiguous
    /// NFA.
    ///
    /// Note that when this method is used, only the `dense_depth` and
    /// `byte_classes` settings on this builder are respected. The other
    /// settings only apply to the initial construction of the Aho-Corasick
    /// automaton. Since using this method requires that initial
    /// construction has already completed, all settings impacting only
    /// initial construction are no longer relevant.
    pub fn build_from_noncontiguous(
        &self,
        nnfa: &noncontiguous::NFA,
    ) -> Result<NFA, BuildError> {
        debug!("building contiguous NFA");
        let byte_classes = if self.byte_classes {
            nnfa.byte_classes().clone()
        } else {
            ByteClasses::singletons()
        };
        let mut index_to_state_id = vec![NFA::DEAD; nnfa.states().len()];
        let mut nfa = NFA {
            repr: vec![],
            pattern_lens: nnfa.pattern_lens_raw().to_vec(),
            state_len: nnfa.states().len(),
            prefilter: nnfa.prefilter().map(|p| p.clone()),
            match_kind: nnfa.match_kind(),
            alphabet_len: byte_classes.alphabet_len(),
            byte_classes,
            min_pattern_len: nnfa.min_pattern_len(),
            max_pattern_len: nnfa.max_pattern_len(),
            // The special state IDs are set later.
            special: Special::zero(),
        };
        for (oldsid, state) in nnfa.states().iter().with_state_ids() {
            // We don't actually encode a fail state since it isn't
            // necessary. But we still want to make sure any FAIL ids are
            // mapped correctly.
            if oldsid == noncontiguous::NFA::FAIL {
                index_to_state_id[oldsid] = NFA::FAIL;
                continue;
            }
            let force_dense = state.depth().as_usize() < self.dense_depth;
            let newsid = State::write(
                nnfa,
                oldsid,
                state,
                &nfa.byte_classes,
                &mut nfa.repr,
                force_dense,
            )?;
            index_to_state_id[oldsid] = newsid;
        }
        for &newsid in index_to_state_id.iter() {
            if newsid == NFA::FAIL {
                continue;
            }
            let state = &mut nfa.repr[newsid.as_usize()..];
            State::remap(nfa.alphabet_len, &index_to_state_id, state)?;
        }
        // Now that we've remapped all the IDs in our states, all that's
        // left is remapping the special state IDs.
        let remap = &index_to_state_id;
        let old = nnfa.special();
        let new = &mut nfa.special;
        new.max_special_id = remap[old.max_special_id];
        new.max_match_id = remap[old.max_match_id];
        new.start_unanchored_id = remap[old.start_unanchored_id];
        new.start_anchored_id = remap[old.start_anchored_id];
        debug!(
            "contiguous NFA built, <states: {:?}, size: {:?}, \
             alphabet len: {:?}>",
            nfa.state_len,
            nfa.memory_usage(),
            nfa.byte_classes.alphabet_len(),
        );
        // The vectors can grow ~twice as big during construction because a
        // Vec amortizes growth. But here, let's shrink things back down to
        // what we actually need since we're never going to add more to it.
        nfa.repr.shrink_to_fit();
        nfa.pattern_lens.shrink_to_fit();
        Ok(nfa)
    }

    /// Set the desired match semantics.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind)
    /// for more documentation and examples.
    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder {
        self.noncontiguous.match_kind(kind);
        self
    }

    /// Enable ASCII-aware case insensitive matching.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive)
    /// for more documentation and examples.
    pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder {
        self.noncontiguous.ascii_case_insensitive(yes);
        self
    }

    /// Enable heuristic prefilter optimizations.
    ///
    /// This only applies when using [`Builder::build`] and not
    /// [`Builder::build_from_noncontiguous`].
    ///
    /// See
    /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter)
    /// for more documentation and examples.
    pub fn prefilter(&mut self, yes: bool) -> &mut Builder {
        self.noncontiguous.prefilter(yes);
        self
    }

    /// Set the limit on how many states use a dense representation for
    /// their transitions. Other states will generally use a sparse
    /// representation.
    ///
    /// See
    /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth)
    /// for more documentation and examples.
    pub fn dense_depth(&mut self, depth: usize) -> &mut Builder {
        self.dense_depth = depth;
        self
    }

    /// A debug setting for whether to attempt to shrink the size of the
    /// automaton's alphabet or not.
    ///
    /// This should never be enabled unless you're debugging an automaton.
    /// Namely, disabling byte classes makes transitions easier to reason
    /// about, since they use the actual bytes instead of equivalence
    /// classes. Disabling this confers no performance benefit at search
    /// time.
    ///
    /// See
    /// [`AhoCorasickBuilder::byte_classes`](crate::AhoCorasickBuilder::byte_classes)
    /// for more documentation and examples.
    pub fn byte_classes(&mut self, yes: bool) -> &mut Builder {
        self.byte_classes = yes;
        self
    }
}

/// Computes the number of u32 values needed to represent one byte per the
/// number of transitions given.
fn u32_len(ntrans: usize) -> usize {
    if ntrans % 4 == 0 {
        ntrans >> 2
    } else {
        (ntrans >> 2) + 1
    }
}

#[cfg(test)]
mod tests {
    // This test demonstrates a SWAR technique I tried in the sparse
    // transition code inside of 'next_state'. Namely, sparse transitions
    // work by iterating over u32 chunks, with each chunk containing up to 4
    // classes corresponding to 4 transitions. This SWAR technique lets us
    // find a matching transition without converting the u32 to a [u8; 4].
    //
    // It turned out to be a little slower unfortunately, which isn't too
    // surprising, since this is likely a throughput oriented optimization.
    // Loop unrolling doesn't really help us because the vast majority of
    // states have very few transitions.
    //
    // Anyway, this code was a little tricky to write, so I converted it to
    // a test in case someone figures out how to use it more effectively
    // than I could.
    //
    // (This also only works on little endian. So big endian would need to
    // be accounted for if we ever decided to use this I think.)
    #[cfg(target_endian = "little")]
    #[test]
    fn swar() {
        use super::*;

        fn has_zero_byte(x: u32) -> u32 {
            const LO_U32: u32 = 0x01010101;
            const HI_U32: u32 = 0x80808080;

            x.wrapping_sub(LO_U32) & !x & HI_U32
        }

        fn broadcast(b: u8) -> u32 {
            (u32::from(b)) * (u32::MAX / 255)
        }

        fn index_of(x: u32) -> usize {
            let o =
                (((x - 1) & 0x01010101).wrapping_mul(0x01010101) >> 24) - 1;
            o.as_usize()
        }

        let bytes: [u8; 4] = [b'1', b'A', b'a', b'z'];
        let chunk = u32::from_ne_bytes(bytes);

        let needle = broadcast(b'1');
        assert_eq!(0, index_of(has_zero_byte(needle ^ chunk)));
        let needle = broadcast(b'A');
        assert_eq!(1, index_of(has_zero_byte(needle ^ chunk)));
        let needle = broadcast(b'a');
        assert_eq!(2, index_of(has_zero_byte(needle ^ chunk)));
        let needle = broadcast(b'z');
        assert_eq!(3, index_of(has_zero_byte(needle ^ chunk)));
    }
}
aho-corasick-1.1.3/src/nfa/mod.rs000064400000000000000000000044251046102023000146560ustar 00000000000000/*!
Provides direct access to NFA implementations of Aho-Corasick.

The principal characteristic of an NFA in this crate is that it may
transition through multiple states per byte of haystack. In Aho-Corasick
parlance, NFAs follow failure transitions during a search. In contrast, a
[`DFA`](crate::dfa::DFA) pre-computes all failure transitions during
compilation at the expense of a much bigger memory footprint.

Currently, there are two NFA implementations provided: noncontiguous and
contiguous. The names reflect their internal representation, and
consequently, the trade offs associated with them:

* A [`noncontiguous::NFA`] uses a separate allocation for every NFA state to
represent its transitions in a sparse format. This is ideal for building an
NFA, since it cheaply permits different states to have a different number of
transitions. A noncontiguous NFA is where the main Aho-Corasick construction
algorithm is implemented. All other Aho-Corasick implementations are built
by first constructing a noncontiguous NFA.
* A [`contiguous::NFA`] uses a single allocation to represent all states,
while still encoding most states as sparse states but permitting states near
the starting state to have a dense representation. The dense representation
uses more memory, but permits computing transitions during a search more
quickly. By only making the most active states dense (the states near the
starting state), a contiguous NFA better balances memory usage with search
speed. The single contiguous allocation also uses less overhead per state
and enables compression tricks where most states only use 8 bytes of heap
memory.

When given the choice between these two, you almost always want to pick a
contiguous NFA. It takes only a little longer to build, but both its memory
usage and search speed are typically much better than a noncontiguous NFA. A
noncontiguous NFA is useful when prioritizing build times, or when there are
so many patterns that a contiguous NFA could not be built.
(Currently, because of both memory and search speed improvements, a
contiguous NFA has a smaller internal limit on the total number of NFA
states it can represent. But you would likely need to have hundreds of
thousands or even millions of patterns before you hit this limit.)
*/

pub mod contiguous;
pub mod noncontiguous;
aho-corasick-1.1.3/src/nfa/noncontiguous.rs000064400000000000000000002237531046102023000170140ustar 00000000000000/*!
Provides a noncontiguous NFA implementation of Aho-Corasick.

This is a low-level API that generally only needs to be used in niche
circumstances. When possible, prefer using [`AhoCorasick`](crate::AhoCorasick)
instead of a noncontiguous NFA directly. Using an `NFA` directly is typically
only necessary when one needs access to the [`Automaton`] trait
implementation.
*/

use alloc::{
    collections::{BTreeSet, VecDeque},
    vec,
    vec::Vec,
};

use crate::{
    automaton::Automaton,
    util::{
        alphabet::{ByteClassSet, ByteClasses},
        error::{BuildError, MatchError},
        prefilter::{self, opposite_ascii_case, Prefilter},
        primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID},
        remapper::Remapper,
        search::{Anchored, MatchKind},
        special::Special,
    },
};

/// A noncontiguous NFA implementation of Aho-Corasick.
///
/// When possible, prefer using [`AhoCorasick`](crate::AhoCorasick) instead of
/// this type directly. Using an `NFA` directly is typically only necessary
/// when one needs access to the [`Automaton`] trait implementation.
///
/// This NFA represents the "core" implementation of Aho-Corasick in this
/// crate. Namely, constructing this NFA involves building a trie and then
/// filling in the failure transitions between states, similar to what is
/// described in any standard textbook description of Aho-Corasick.
///
/// In order to minimize heap usage and to avoid additional construction
/// costs, this implementation represents the transitions of all states as
/// distinct sparse memory allocations. This is where it gets its name from.
/// That is, this NFA has no contiguous memory allocation for its transition
/// table. Each state gets its own allocation.
///
/// While the sparse representation keeps memory usage to somewhat reasonable
/// levels, it is still quite large and also results in somewhat mediocre
/// search performance. For this reason, it is almost always a good idea to
/// use a [`contiguous::NFA`](crate::nfa::contiguous::NFA) instead. It is
/// marginally slower to build, but has higher throughput and can sometimes
/// use an order of magnitude less memory. The main reason to use a
/// noncontiguous NFA is when you need the fastest possible construction
/// time, or when a contiguous NFA does not have the desired capacity. (The
/// total number of NFA states it can have is fewer than a noncontiguous
/// NFA.)
///
/// # Example
///
/// This example shows how to build an `NFA` directly and use it to execute
/// [`Automaton::try_find`]:
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton,
///     nfa::noncontiguous::NFA,
///     Input, Match,
/// };
///
/// let patterns = &["b", "abc", "abcd"];
/// let haystack = "abcd";
///
/// let nfa = NFA::new(patterns).unwrap();
/// assert_eq!(
///     Some(Match::must(0, 1..2)),
///     nfa.try_find(&Input::new(haystack))?,
/// );
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// It is also possible to implement your own version of `try_find`. See the
/// [`Automaton`] documentation for an example.
#[derive(Clone)]
pub struct NFA {
    /// The match semantics built into this NFA.
    match_kind: MatchKind,
    /// A set of states.
    /// Each state defines its own transitions, a fail transition and a set
    /// of indices corresponding to matches.
    ///
    /// The first state (index 0) is always the dead state. Dead states are
    /// in every automaton, but only used when leftmost-{first,longest}
    /// match semantics are enabled. Specifically, they instruct search to
    /// stop at specific points in order to report the correct match
    /// location. In the standard Aho-Corasick construction, there are no
    /// transitions to the dead state.
    ///
    /// The second state (index 1) is always the fail state, which is used
    /// only as a sentinel. Namely, in the final NFA, no transition into the
    /// fail state exists. (Well, they do, but they aren't followed.
    /// Instead, the state's failure transition is followed.)
    ///
    /// The third state (index 2) is generally intended to be the starting
    /// or "root" state.
    states: Vec<State>,
    /// Transitions stored in a sparse representation via a linked list.
    ///
    /// Each transition contains three pieces of information: the byte it
    /// is defined for, the state it transitions to and a link to the next
    /// transition in the same state (or `StateID::ZERO` if it is the last
    /// transition).
    ///
    /// The first transition for each state is determined by
    /// `State::sparse`.
    ///
    /// Note that this contains a complete set of all transitions in this
    /// NFA, including states that have a dense representation for
    /// transitions. (Adding dense transitions for a state doesn't remove
    /// its sparse transitions, since deleting transitions from this
    /// particular sparse representation would be fairly expensive.)
    sparse: Vec<Transition>,
    /// Transitions stored in a dense representation.
    ///
    /// A state has a row in this table if and only if `State::dense` is
    /// not equal to `StateID::ZERO`. When not zero, there are precisely
    /// `NFA::byte_classes::alphabet_len()` entries beginning at
    /// `State::dense` in this table.
    ///
    /// Generally a very small minority of states have a dense
    /// representation since it uses so much memory.
    dense: Vec<StateID>,
    /// Matches stored in linked list for each state.
    ///
    /// Like sparse transitions, each match has a link to the next match in
    /// the state.
    ///
    /// The first match for each state is determined by `State::matches`.
    matches: Vec<Match>,
    /// The length, in bytes, of each pattern in this NFA. This slice is
    /// indexed by `PatternID`.
    ///
    /// The number of entries in this vector corresponds to the total number
    /// of patterns in this automaton.
    pattern_lens: Vec<SmallIndex>,
    /// A prefilter for quickly skipping to candidate matches, if pertinent.
    prefilter: Option<Prefilter>,
    /// A set of equivalence classes in terms of bytes. We compute this
    /// while building the NFA, but only use it in the NFA's states that
    /// have been densified. We also use this for building the DFA. We store
    /// it on the NFA since it's easy to compute while visiting the
    /// patterns.
    byte_classes: ByteClasses,
    /// The length, in bytes, of the shortest pattern in this automaton.
    /// This information is useful for detecting whether an automaton
    /// matches the empty string or not.
    min_pattern_len: usize,
    /// The length, in bytes, of the longest pattern in this automaton. This
    /// information is useful for keeping correct buffer sizes when
    /// searching on streams.
    max_pattern_len: usize,
    /// The information required to deduce which states are "special" in
    /// this NFA.
    ///
    /// Since the DEAD and FAIL states are always the first two states and
    /// there are only ever two start states (which follow all of the match
    /// states), it follows that we can determine whether a state is a
    /// fail, dead, match or start with just a few comparisons on the ID
    /// itself:
    ///
    ///    is_dead(sid): sid == NFA::DEAD
    ///    is_fail(sid): sid == NFA::FAIL
    ///   is_match(sid): NFA::FAIL < sid && sid <= max_match_id
    ///   is_start(sid): sid == start_unanchored_id || sid == start_anchored_id
    ///
    /// Note that this only applies to the NFA after it has been
    /// constructed. During construction, the start states are the first
    /// ones added and the match states are inter-leaved with non-match
    /// states. Once all of the states have been added, the states are
    /// shuffled such that the above predicates hold.
    special: Special,
}

impl NFA {
    /// Create a new Aho-Corasick noncontiguous NFA using the default
    /// configuration.
    ///
    /// Use a [`Builder`] if you want to change the configuration.
    pub fn new<I, P>(patterns: I) -> Result<NFA, BuildError>
    where
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        NFA::builder().build(patterns)
    }

    /// A convenience method for returning a new Aho-Corasick noncontiguous
    /// NFA builder.
    ///
    /// This usually permits one to just import the `NFA` type.
    pub fn builder() -> Builder {
        Builder::new()
    }
}

impl NFA {
    /// The DEAD state is a sentinel state like the FAIL state. The DEAD
    /// state instructs any search to stop and return any currently recorded
    /// match, or no match otherwise. Generally speaking, it is impossible
    /// for an unanchored standard search to enter a DEAD state. But an
    /// anchored search can, and so too can a leftmost search.
    ///
    /// We put DEAD before FAIL so that DEAD is always 0. We repeat this
    /// decision across the other Aho-Corasick automata, so that DEAD
    /// states there are always 0 too. It's not that we need all of the
    /// implementations to agree, but rather, the contiguous NFA and the DFA
    /// use a sort of "premultiplied" state identifier where the only state
    /// whose ID is always known and constant is the first state. Subsequent
    /// state IDs depend on how much space has already been used in the
    /// transition table.
    pub(crate) const DEAD: StateID = StateID::new_unchecked(0);
    /// The FAIL state mostly just corresponds to the ID of any transition
    /// on a state that isn't explicitly defined. When one transitions into
    /// the FAIL state, one must follow the previous state's failure
    /// transition before doing the next state lookup. In this way, FAIL is
    /// more of a sentinel than a state that one actually transitions into.
    /// In particular, it is never exposed in the `Automaton` interface.
    pub(crate) const FAIL: StateID = StateID::new_unchecked(1);

    /// Returns the equivalence classes of bytes found while constructing
    /// this NFA.
    ///
    /// Note that the NFA only makes use of these equivalence classes for
    /// states that have been converted to a dense representation.
    /// Otherwise, these are useful for building the DFA when desired.
    pub(crate) fn byte_classes(&self) -> &ByteClasses {
        &self.byte_classes
    }

    /// Returns a slice containing the length of each pattern in this
    /// searcher. It is indexed by `PatternID` and has length
    /// `NFA::patterns_len`.
    ///
    /// This is exposed for convenience when building a contiguous NFA. But
    /// it can be reconstructed from the `Automaton` API if necessary.
    pub(crate) fn pattern_lens_raw(&self) -> &[SmallIndex] {
        &self.pattern_lens
    }

    /// Returns a slice of all states in this non-contiguous NFA.
    pub(crate) fn states(&self) -> &[State] {
        &self.states
    }

    /// Returns the underlying "special" state information for this NFA.
    pub(crate) fn special(&self) -> &Special {
        &self.special
    }

    /// Swaps the states at `id1` and `id2`.
    ///
    /// This does not update the transitions of any state to account for the
    /// state swap.
    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
        self.states.swap(id1.as_usize(), id2.as_usize());
    }

    /// Re-maps all state IDs in this NFA according to the `map` function
    /// given.
    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
        let alphabet_len = self.byte_classes.alphabet_len();
        for state in self.states.iter_mut() {
            state.fail = map(state.fail);
            let mut link = state.sparse;
            while link != StateID::ZERO {
                let t = &mut self.sparse[link];
                t.next = map(t.next);
                link = t.link;
            }
            if state.dense != StateID::ZERO {
                let start = state.dense.as_usize();
                for next in self.dense[start..][..alphabet_len].iter_mut() {
                    *next = map(*next);
                }
            }
        }
    }

    /// Iterate over all of the transitions for the given state ID.
    pub(crate) fn iter_trans(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = Transition> + '_ {
        let mut link = self.states[sid].sparse;
        core::iter::from_fn(move || {
            if link == StateID::ZERO {
                return None;
            }
            let t = self.sparse[link];
            link = t.link;
            Some(t)
        })
    }

    /// Iterate over all of the matches for the given state ID.
    pub(crate) fn iter_matches(
        &self,
        sid: StateID,
    ) -> impl Iterator<Item = PatternID> + '_ {
        let mut link = self.states[sid].matches;
        core::iter::from_fn(move || {
            if link == StateID::ZERO {
                return None;
            }
            let m = self.matches[link];
            link = m.link;
            Some(m.pid)
        })
    }

    /// Return the link following the one given. If the one given is the
    /// last link for the given state, then return `None`.
    ///
    /// If no previous link is given, then this returns the first link in
    /// the state, if one exists.
    ///
    /// This is useful for manually iterating over the transitions in a
    /// single state without borrowing the NFA. This permits mutating other
    /// parts of the NFA during iteration. Namely, one can access the
    /// transition pointed to by the link via `self.sparse[link]`.
    fn next_link(
        &self,
        sid: StateID,
        prev: Option<StateID>,
    ) -> Option<StateID> {
        let link =
            prev.map_or(self.states[sid].sparse, |p| self.sparse[p].link);
        if link == StateID::ZERO {
            None
        } else {
            Some(link)
        }
    }

    /// Follow the transition for the given byte in the given state. If no
    /// such transition exists, then the FAIL state ID is returned.
    #[inline(always)]
    fn follow_transition(&self, sid: StateID, byte: u8) -> StateID {
        let s = &self.states[sid];
        // This is a special case that targets starting states and states
        // near a start state. Namely, after the initial trie is
        // constructed, we look for states close to the start state to
        // convert to a dense representation for their transitions. This
        // winds up using a lot more memory per state in exchange for faster
        // transition lookups. But since we only do this for a small number
        // of states (by default), the memory usage is usually minimal.
        //
        // This has *massive* benefit when executing searches because the
        // unanchored starting state is by far the hottest state and is
        // frequently visited. Moreover, the 'for' loop below that works
        // decently on an actually sparse state is disastrous on a state
        // that is nearly or completely dense.
        if s.dense == StateID::ZERO {
            self.follow_transition_sparse(sid, byte)
        } else {
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[s.dense.as_usize() + class]
        }
    }

    /// Like `follow_transition`, but always uses the sparse representation.
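    ///
    /// Note that the sparse transition list for a state is kept sorted by
    /// byte value (see `add_transition` below), which is what permits the
    /// scan here to bail out early as soon as it passes `byte`.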
    #[inline(always)]
    fn follow_transition_sparse(&self, sid: StateID, byte: u8) -> StateID {
        for t in self.iter_trans(sid) {
            if byte <= t.byte {
                if byte == t.byte {
                    return t.next;
                }
                break;
            }
        }
        NFA::FAIL
    }

    /// Set the transition for the given byte to the state ID given.
    ///
    /// Note that one should not set transitions to the FAIL state. It is
    /// not technically incorrect, but it wastes space. If a transition is
    /// not defined, then it is automatically assumed to lead to the FAIL
    /// state.
    fn add_transition(
        &mut self,
        prev: StateID,
        byte: u8,
        next: StateID,
    ) -> Result<(), BuildError> {
        if self.states[prev].dense != StateID::ZERO {
            let dense = self.states[prev].dense;
            let class = usize::from(self.byte_classes.get(byte));
            self.dense[dense.as_usize() + class] = next;
        }
        let head = self.states[prev].sparse;
        if head == StateID::ZERO || byte < self.sparse[head].byte {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] = Transition { byte, next, link: head };
            self.states[prev].sparse = new_link;
            return Ok(());
        } else if byte == self.sparse[head].byte {
            self.sparse[head].next = next;
            return Ok(());
        }
        // We handled the only cases where the beginning of the transition
        // chain needs to change. At this point, we now know that there is
        // at least one entry in the transition chain and the byte for that
        // transition is less than the byte for the transition we're adding.
        let (mut link_prev, mut link_next) = (head, self.sparse[head].link);
        while link_next != StateID::ZERO && byte > self.sparse[link_next].byte
        {
            link_prev = link_next;
            link_next = self.sparse[link_next].link;
        }
        if link_next == StateID::ZERO || byte < self.sparse[link_next].byte {
            let link = self.alloc_transition()?;
            self.sparse[link] = Transition { byte, next, link: link_next };
            self.sparse[link_prev].link = link;
        } else {
            assert_eq!(byte, self.sparse[link_next].byte);
            self.sparse[link_next].next = next;
        }
        Ok(())
    }

    /// This sets every possible transition (all 256 of them) for the given
    /// state to the same `next` value.
    ///
    /// This is useful for efficiently initializing start/dead states.
    ///
    /// # Panics
    ///
    /// This requires that the state has no transitions added to it already.
    /// If it has any transitions, then this panics. It will also panic if
    /// the state has been densified prior to calling this.
    fn init_full_state(
        &mut self,
        prev: StateID,
        next: StateID,
    ) -> Result<(), BuildError> {
        assert_eq!(
            StateID::ZERO,
            self.states[prev].dense,
            "state must not be dense yet"
        );
        assert_eq!(
            StateID::ZERO,
            self.states[prev].sparse,
            "state must have zero transitions"
        );
        let mut prev_link = StateID::ZERO;
        for byte in 0..=255 {
            let new_link = self.alloc_transition()?;
            self.sparse[new_link] =
                Transition { byte, next, link: StateID::ZERO };
            if prev_link == StateID::ZERO {
                self.states[prev].sparse = new_link;
            } else {
                self.sparse[prev_link].link = new_link;
            }
            prev_link = new_link;
        }
        Ok(())
    }

    /// Add a match for the given pattern ID to the state for the given ID.
    fn add_match(
        &mut self,
        sid: StateID,
        pid: PatternID,
    ) -> Result<(), BuildError> {
        let head = self.states[sid].matches;
        let mut link = head;
        while self.matches[link].link != StateID::ZERO {
            link = self.matches[link].link;
        }
        let new_match_link = self.alloc_match()?;
        self.matches[new_match_link].pid = pid;
        if link == StateID::ZERO {
            self.states[sid].matches = new_match_link;
        } else {
            self.matches[link].link = new_match_link;
        }
        Ok(())
    }

    /// Copy matches from the `src` state to the `dst` state. This is useful
    /// when a match state can be reached via a failure transition.
    /// In which case, you'll want to copy the matches (if any) from the
    /// state reached by the failure transition to the original state you
    /// were at.
    fn copy_matches(
        &mut self,
        src: StateID,
        dst: StateID,
    ) -> Result<(), BuildError> {
        let head_dst = self.states[dst].matches;
        let mut link_dst = head_dst;
        while self.matches[link_dst].link != StateID::ZERO {
            link_dst = self.matches[link_dst].link;
        }
        let mut link_src = self.states[src].matches;
        while link_src != StateID::ZERO {
            let new_match_link =
                StateID::new(self.matches.len()).map_err(|e| {
                    BuildError::state_id_overflow(
                        StateID::MAX.as_u64(),
                        e.attempted(),
                    )
                })?;
            self.matches.push(Match {
                pid: self.matches[link_src].pid,
                link: StateID::ZERO,
            });
            if link_dst == StateID::ZERO {
                self.states[dst].matches = new_match_link;
            } else {
                self.matches[link_dst].link = new_match_link;
            }
            link_dst = new_match_link;
            link_src = self.matches[link_src].link;
        }
        Ok(())
    }

    /// Create a new entry in `NFA::sparse`, if there's room, and return
    /// that entry's ID. If there's no room, then an error is returned.
    fn alloc_transition(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.sparse.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.sparse.push(Transition::default());
        Ok(id)
    }

    /// Create a new entry in `NFA::matches`, if there's room, and return
    /// that entry's ID. If there's no room, then an error is returned.
    fn alloc_match(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.matches.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.matches.push(Match::default());
        Ok(id)
    }

    /// Create a new set of `N` transitions in this NFA's dense transition
    /// table. The ID returned corresponds to the index at which the `N`
    /// transitions begin. So `id+0` is the first transition and `id+(N-1)`
    /// is the last.
    ///
    /// `N` is determined via `NFA::byte_classes::alphabet_len`.
    fn alloc_dense_state(&mut self) -> Result<StateID, BuildError> {
        let id = StateID::new(self.dense.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        // We use FAIL because it's the correct default. If a state doesn't
        // have a transition defined for every possible byte value, then the
        // transition function should return NFA::FAIL.
        self.dense.extend(
            core::iter::repeat(NFA::FAIL)
                .take(self.byte_classes.alphabet_len()),
        );
        Ok(id)
    }

    /// Allocate and add a fresh state to the underlying NFA and return its
    /// ID (guaranteed to be one more than the ID of the previously
    /// allocated state). If the ID would overflow `StateID`, then this
    /// returns an error.
    fn alloc_state(&mut self, depth: usize) -> Result<StateID, BuildError> {
        // This is OK because we error when building the trie if we see a
        // pattern whose length cannot fit into a 'SmallIndex', and the
        // longest possible depth corresponds to the length of the longest
        // pattern.
        let depth = SmallIndex::new(depth)
            .expect("patterns longer than SmallIndex::MAX are not allowed");
        let id = StateID::new(self.states.len()).map_err(|e| {
            BuildError::state_id_overflow(StateID::MAX.as_u64(), e.attempted())
        })?;
        self.states.push(State {
            sparse: StateID::ZERO,
            dense: StateID::ZERO,
            matches: StateID::ZERO,
            fail: self.special.start_unanchored_id,
            depth,
        });
        Ok(id)
    }
}

// SAFETY: 'start_state' always returns a valid state ID, 'next_state' always
// returns a valid state ID given a valid state ID. We otherwise claim that
// all other methods are correct as well.
unsafe impl Automaton for NFA {
    #[inline(always)]
    fn start_state(&self, anchored: Anchored) -> Result<StateID, MatchError> {
        match anchored {
            Anchored::No => Ok(self.special.start_unanchored_id),
            Anchored::Yes => Ok(self.special.start_anchored_id),
        }
    }

    #[inline(always)]
    fn next_state(
        &self,
        anchored: Anchored,
        mut sid: StateID,
        byte: u8,
    ) -> StateID {
        // This terminates since:
        //
        // 1. state.fail never points to the FAIL state.
        // 2. All state.fail values point to a state closer to the start
        //    state.
        // 3. The start state has no transitions to the FAIL state.
        loop {
            let next = self.follow_transition(sid, byte);
            if next != NFA::FAIL {
                return next;
            }
            // For an anchored search, we never follow failure transitions
            // because failure transitions lead us down a path to matching
            // a *proper* suffix of the path we were on. Thus, it can only
            // produce matches that appear after the beginning of the
            // search.
            if anchored.is_anchored() {
                return NFA::DEAD;
            }
            sid = self.states[sid].fail();
        }
    }

    #[inline(always)]
    fn is_special(&self, sid: StateID) -> bool {
        sid <= self.special.max_special_id
    }

    #[inline(always)]
    fn is_dead(&self, sid: StateID) -> bool {
        sid == NFA::DEAD
    }

    #[inline(always)]
    fn is_match(&self, sid: StateID) -> bool {
        // N.B. This returns true when sid==NFA::FAIL but that's okay
        // because NFA::FAIL is not actually a valid state ID from the
        // perspective of the Automaton trait. Namely, it is never returned
        // by 'start_state' or by 'next_state'. So we don't need to care
        // about it here.
        !self.is_dead(sid) && sid <= self.special.max_match_id
    }

    #[inline(always)]
    fn is_start(&self, sid: StateID) -> bool {
        sid == self.special.start_unanchored_id
            || sid == self.special.start_anchored_id
    }

    #[inline(always)]
    fn match_kind(&self) -> MatchKind {
        self.match_kind
    }

    #[inline(always)]
    fn patterns_len(&self) -> usize {
        self.pattern_lens.len()
    }

    #[inline(always)]
    fn pattern_len(&self, pid: PatternID) -> usize {
        self.pattern_lens[pid].as_usize()
    }

    #[inline(always)]
    fn min_pattern_len(&self) -> usize {
        self.min_pattern_len
    }

    #[inline(always)]
    fn max_pattern_len(&self) -> usize {
        self.max_pattern_len
    }

    #[inline(always)]
    fn match_len(&self, sid: StateID) -> usize {
        self.iter_matches(sid).count()
    }

    #[inline(always)]
    fn match_pattern(&self, sid: StateID, index: usize) -> PatternID {
        self.iter_matches(sid).nth(index).unwrap()
    }

    #[inline(always)]
    fn memory_usage(&self) -> usize {
        self.states.len() * core::mem::size_of::<State>()
            + self.sparse.len() * core::mem::size_of::<Transition>()
            + self.matches.len() * core::mem::size_of::<Match>()
            + self.dense.len() * StateID::SIZE
            + self.pattern_lens.len() * SmallIndex::SIZE
            + self.prefilter.as_ref().map_or(0, |p| p.memory_usage())
    }

    #[inline(always)]
    fn prefilter(&self) -> Option<&Prefilter> {
        self.prefilter.as_ref()
    }
}

/// A representation of a sparse NFA state for an Aho-Corasick automaton.
///
/// It contains the transitions to the next state, a failure transition for
/// cases where there exists no other transition for the current input byte
/// and the matches implied by visiting this state (if any).
#[derive(Clone, Debug)]
pub(crate) struct State {
    /// A pointer to `NFA::sparse` corresponding to the head of a linked
    /// list containing all of the transitions for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state has zero
    /// transitions.
    sparse: StateID,
    /// A pointer to a row of `N` transitions in `NFA::dense`. These
    /// transitions correspond precisely to what is obtained by traversing
    /// `sparse`, but permit constant time lookup.
    ///
    /// When this is zero (which is true for most states in the default
    /// configuration), then this state has no dense representation.
    ///
    /// Note that `N` is equal to `NFA::byte_classes::alphabet_len()`. This
    /// is typically much less than 256 (the maximum value).
    dense: StateID,
    /// A pointer to `NFA::matches` corresponding to the head of a linked
    /// list containing all of the matches for this state.
    ///
    /// This is `StateID::ZERO` if and only if this state is not a match
    /// state.
    matches: StateID,
    /// The state that should be transitioned to if the current byte in the
    /// haystack does not have a corresponding transition defined in this
    /// state.
    fail: StateID,
    /// The depth of this state. Specifically, this is the distance from
    /// this state to the starting state. (For the special sentinel states
    /// DEAD and FAIL, their depth is always 0.) The depth of a starting
    /// state is 0.
    ///
    /// Note that depth is currently not used in this non-contiguous NFA. It
    /// may in the future, but it is used in the contiguous NFA. Namely, it
    /// permits an optimization where states near the starting state have
    /// their transitions stored in a dense fashion, but all other states
    /// have their transitions stored in a sparse fashion. (This
    /// non-contiguous NFA uses a sparse representation for all states
    /// unconditionally.) In any case, this is really the only convenient
    /// place to compute and store this information, which we need when
    /// building the contiguous NFA.
    depth: SmallIndex,
}

impl State {
    /// Return true if and only if this state is a match state.
    pub(crate) fn is_match(&self) -> bool {
        self.matches != StateID::ZERO
    }

    /// Returns the failure transition for this state.
    pub(crate) fn fail(&self) -> StateID {
        self.fail
    }

    /// Returns the depth of this state. That is, the number of transitions
    /// this state is from the start state of the NFA.
    pub(crate) fn depth(&self) -> SmallIndex {
        self.depth
    }
}

/// A single transition in a non-contiguous NFA.
#[derive(Clone, Copy, Default)]
#[repr(packed)]
pub(crate) struct Transition {
    byte: u8,
    next: StateID,
    link: StateID,
}

impl Transition {
    /// Return the byte for which this transition is defined.
    pub(crate) fn byte(&self) -> u8 {
        self.byte
    }

    /// Return the ID of the state that this transition points to.
    pub(crate) fn next(&self) -> StateID {
        self.next
    }

    /// Return the ID of the next transition.
    fn link(&self) -> StateID {
        self.link
    }
}

impl core::fmt::Debug for Transition {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "Transition(byte: {:X?}, next: {:?}, link: {:?})",
            self.byte,
            self.next().as_usize(),
            self.link().as_usize()
        )
    }
}

/// A single match in a non-contiguous NFA.
#[derive(Clone, Copy, Default)]
struct Match {
    pid: PatternID,
    link: StateID,
}

impl Match {
    /// Return the pattern ID for this match.
    pub(crate) fn pattern(&self) -> PatternID {
        self.pid
    }

    /// Return the ID of the next match.
    fn link(&self) -> StateID {
        self.link
    }
}

impl core::fmt::Debug for Match {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "Match(pid: {:?}, link: {:?})",
            self.pattern().as_usize(),
            self.link().as_usize()
        )
    }
}

/// A builder for configuring an Aho-Corasick noncontiguous NFA.
///
/// This builder has a subset of the options available to an
/// [`AhoCorasickBuilder`](crate::AhoCorasickBuilder). Of the shared options,
/// their behavior is identical.
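///
/// # Example
///
/// A minimal sketch of configuring a noncontiguous NFA via this builder
/// (here, with leftmost-first match semantics, mirroring how one would
/// configure an `AhoCorasickBuilder`):
///
/// ```
/// use aho_corasick::{
///     automaton::Automaton, nfa::noncontiguous::NFA, MatchKind,
/// };
///
/// let nfa = NFA::builder()
///     .match_kind(MatchKind::LeftmostFirst)
///     .build(&["samwise", "sam"])
///     .unwrap();
/// assert_eq!(2, nfa.patterns_len());
/// ```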
#[derive(Clone, Debug)] pub struct Builder { match_kind: MatchKind, prefilter: bool, ascii_case_insensitive: bool, dense_depth: usize, } impl Default for Builder { fn default() -> Builder { Builder { match_kind: MatchKind::default(), prefilter: true, ascii_case_insensitive: false, dense_depth: 3, } } } impl Builder { /// Create a new builder for configuring an Aho-Corasick noncontiguous NFA. pub fn new() -> Builder { Builder::default() } /// Build an Aho-Corasick noncontiguous NFA from the given iterator of /// patterns. /// /// A builder may be reused to create more NFAs. pub fn build<I, P>(&self, patterns: I) -> Result<NFA, BuildError> where I: IntoIterator<Item = P>, P: AsRef<[u8]>, { debug!("building non-contiguous NFA"); let nfa = Compiler::new(self)?.compile(patterns)?; debug!( "non-contiguous NFA built, <states: {:?}, size: {:?}>", nfa.states.len(), nfa.memory_usage() ); Ok(nfa) } /// Set the desired match semantics. /// /// See /// [`AhoCorasickBuilder::match_kind`](crate::AhoCorasickBuilder::match_kind) /// for more documentation and examples. pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { self.match_kind = kind; self } /// Enable ASCII-aware case insensitive matching. /// /// See /// [`AhoCorasickBuilder::ascii_case_insensitive`](crate::AhoCorasickBuilder::ascii_case_insensitive) /// for more documentation and examples. pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { self.ascii_case_insensitive = yes; self } /// Set the limit on how many states use a dense representation for their /// transitions. Other states will generally use a sparse representation. /// /// See /// [`AhoCorasickBuilder::dense_depth`](crate::AhoCorasickBuilder::dense_depth) /// for more documentation and examples. pub fn dense_depth(&mut self, depth: usize) -> &mut Builder { self.dense_depth = depth; self } /// Enable heuristic prefilter optimizations. /// /// See /// [`AhoCorasickBuilder::prefilter`](crate::AhoCorasickBuilder::prefilter) /// for more documentation and examples. pub fn prefilter(&mut self, yes: bool) -> &mut Builder { self.prefilter = yes; self } } /// A compiler uses a builder configuration and builds up the NFA formulation /// of an Aho-Corasick automaton. This roughly corresponds to the standard /// formulation described in textbooks, with some tweaks to support leftmost /// searching. #[derive(Debug)] struct Compiler<'a> { builder: &'a Builder, prefilter: prefilter::Builder, nfa: NFA, byteset: ByteClassSet, } impl<'a> Compiler<'a> { fn new(builder: &'a Builder) -> Result<Compiler<'a>, BuildError> { let prefilter = prefilter::Builder::new(builder.match_kind) .ascii_case_insensitive(builder.ascii_case_insensitive); Ok(Compiler { builder, prefilter, nfa: NFA { match_kind: builder.match_kind, states: vec![], sparse: vec![], dense: vec![], matches: vec![], pattern_lens: vec![], prefilter: None, byte_classes: ByteClasses::singletons(), min_pattern_len: usize::MAX, max_pattern_len: 0, special: Special::zero(), }, byteset: ByteClassSet::empty(), }) } fn compile<I, P>(mut self, patterns: I) -> Result<NFA, BuildError> where I: IntoIterator<Item = P>, P: AsRef<[u8]>, { // Add dummy transition/match links, so that no valid link will point // to another link at index 0. self.nfa.sparse.push(Transition::default()); self.nfa.matches.push(Match::default()); // Add a dummy dense transition so that no states can have dense==0 // represent a valid pointer to dense transitions. This permits // dense==0 to be a sentinel indicating "no dense transitions."
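// (Illustrative aside, not load-bearing: assuming `alloc_dense_state`
// hands out rows at the end of `NFA::dense`, a densified state's row of
// N transitions, with N being the byte class alphabet length, lives at
// dense[start..start + N] for some start >= 1 once this dummy entry is
// in place, so dense == 0 can unambiguously mean "not densified.")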
self.nfa.dense.push(NFA::DEAD); // the dead state, only used for leftmost and fixed to id==0 self.nfa.alloc_state(0)?; // the fail state, which is never entered and fixed to id==1 self.nfa.alloc_state(0)?; // unanchored start state, initially fixed to id==2 but later shuffled // to appear after all non-start match states. self.nfa.special.start_unanchored_id = self.nfa.alloc_state(0)?; // anchored start state, initially fixed to id==3 but later shuffled // to appear after unanchored start state. self.nfa.special.start_anchored_id = self.nfa.alloc_state(0)?; // Initialize the unanchored starting state in order to make it dense, // and thus make transition lookups on this state faster. self.init_unanchored_start_state()?; // Set all transitions on the DEAD state to point to itself. This way, // the DEAD state can never be escaped. It MUST be used as a sentinel // in any correct search. self.add_dead_state_loop()?; // Build the base trie from the given patterns. self.build_trie(patterns)?; self.nfa.states.shrink_to_fit(); // Turn our set of bytes into equivalence classes. This NFA // implementation uses byte classes only for states that use a dense // representation of transitions. (And that's why this comes before // `self.densify()`, as the byte classes need to be set first.) self.nfa.byte_classes = self.byteset.byte_classes(); // Add transitions (and maybe matches) to the anchored starting state. // The anchored starting state is used for anchored searches. The only // mechanical difference between it and the unanchored start state is // that missing transitions map to the DEAD state instead of the FAIL // state. self.set_anchored_start_state()?; // Rewrite transitions to the FAIL state on the unanchored start state // as self-transitions. This keeps the start state active at all times. self.add_unanchored_start_state_loop(); // Make some (possibly zero) states use a dense representation for // transitions. It's important to do this right after the states // and non-failure transitions are solidified. That way, subsequent // accesses (particularly `fill_failure_transitions`) will benefit from // the faster transition lookup in densified states. self.densify()?; // The meat of the Aho-Corasick algorithm: compute and write failure // transitions, i.e., the state to move to when a transition isn't // defined in the current state. These are epsilon transitions and thus // make this formulation an NFA. self.fill_failure_transitions()?; // Handle a special case under leftmost semantics when at least one // of the patterns is the empty string. self.close_start_state_loop_for_leftmost(); // Shuffle states so that we have DEAD, FAIL, MATCH, ..., START, START, // NON-MATCH, ... This permits us to very quickly query the type of // the state we're currently in during a search. self.shuffle(); self.nfa.prefilter = self.prefilter.build(); // Store the maximum ID of all *relevant* special states. Start states // are only relevant when we have a prefilter, otherwise, there is zero // reason to care about whether a state is a start state or not during // a search. Indeed, without a prefilter, we are careful to explicitly // NOT care about start states, otherwise the search can ping pong // between the unrolled loop and the handling of special-status states // and destroy perf. self.nfa.special.max_special_id = if self.nfa.prefilter.is_some() { // Why the anchored starting state? Because we always put it // after the unanchored starting state and it is therefore the // maximum.
Why put unanchored followed by anchored? No particular // reason, but that's how the states are logically organized in the // Thompson NFA implementation found in regex-automata. ¯\_(ツ)_/¯ self.nfa.special.start_anchored_id } else { self.nfa.special.max_match_id }; self.nfa.sparse.shrink_to_fit(); self.nfa.dense.shrink_to_fit(); self.nfa.matches.shrink_to_fit(); self.nfa.pattern_lens.shrink_to_fit(); Ok(self.nfa) } /// This sets up the initial prefix trie that makes up the Aho-Corasick /// automaton. Effectively, it creates the basic structure of the /// automaton, where every pattern given has a path from the start state to /// the end of the pattern. fn build_trie<I, P>(&mut self, patterns: I) -> Result<(), BuildError> where I: IntoIterator<Item = P>, P: AsRef<[u8]>, { 'PATTERNS: for (i, pat) in patterns.into_iter().enumerate() { let pid = PatternID::new(i).map_err(|e| { BuildError::pattern_id_overflow( PatternID::MAX.as_u64(), e.attempted(), ) })?; let pat = pat.as_ref(); let patlen = SmallIndex::new(pat.len()) .map_err(|_| BuildError::pattern_too_long(pid, pat.len()))?; self.nfa.min_pattern_len = core::cmp::min(self.nfa.min_pattern_len, pat.len()); self.nfa.max_pattern_len = core::cmp::max(self.nfa.max_pattern_len, pat.len()); assert_eq!( i, self.nfa.pattern_lens.len(), "expected number of patterns to match pattern ID" ); self.nfa.pattern_lens.push(patlen); // We add the pattern to the prefilter here because the pattern // ID in the prefilter is determined with respect to the patterns // added to the prefilter. That is, it isn't the ID we have here, // but the one determined by its own accounting of patterns. // To ensure they line up, we add every pattern we see to the // prefilter, even if some patterns ultimately are impossible to // match (in leftmost-first semantics specifically). // // Another way of doing this would be to expose an API in the // prefilter to permit setting your own pattern IDs. Or to just use // our own map and go between them. But this case is sufficiently // rare that we don't bother and just make sure they're in sync. if self.builder.prefilter { self.prefilter.add(pat); } let mut prev = self.nfa.special.start_unanchored_id; let mut saw_match = false; for (depth, &b) in pat.iter().enumerate() { // When leftmost-first match semantics are requested, we // specifically stop adding patterns when a previously added // pattern is a prefix of it. We avoid adding it because // leftmost-first semantics imply that the pattern can never // match. This is not just an optimization to save space! It // is necessary for correctness. In fact, this is the only // difference in the automaton between the implementations for // leftmost-first and leftmost-longest. saw_match = saw_match || self.nfa.states[prev].is_match(); if self.builder.match_kind.is_leftmost_first() && saw_match { // Skip to the next pattern immediately. This avoids // incorrectly adding a match after this loop terminates. continue 'PATTERNS; } // Add this byte to our equivalence classes. These don't // get used while building the trie, but other Aho-Corasick // implementations may use them. self.byteset.set_range(b, b); if self.builder.ascii_case_insensitive { let b = opposite_ascii_case(b); self.byteset.set_range(b, b); } // If the transition from prev using the current byte already // exists, then just move through it. Otherwise, add a new // state. We track the depth here so that we can determine // how to represent transitions.
States near the start state // use a dense representation that uses more memory but is // faster. Other states use a sparse representation that uses // less memory but is slower. let next = self.nfa.follow_transition(prev, b); if next != NFA::FAIL { prev = next; } else { let next = self.nfa.alloc_state(depth)?; self.nfa.add_transition(prev, b, next)?; if self.builder.ascii_case_insensitive { let b = opposite_ascii_case(b); self.nfa.add_transition(prev, b, next)?; } prev = next; } } // Once the pattern has been added, log the match in the final // state that it reached. self.nfa.add_match(prev, pid)?; } Ok(()) } /// This routine creates failure transitions according to the standard /// textbook formulation of the Aho-Corasick algorithm, with a couple small /// tweaks to support "leftmost" semantics. /// /// Building failure transitions is the most interesting part of building /// the Aho-Corasick automaton, because they are what allow searches to /// be performed in linear time. Specifically, a failure transition is /// a single transition associated with each state that points back to /// the longest proper suffix of the pattern being searched. The failure /// transition is followed whenever there exists no transition on the /// current state for the current input byte. If there is no other proper /// suffix, then the failure transition points back to the starting state. /// /// For example, let's say we built an Aho-Corasick automaton with the /// following patterns: 'abcd' and 'cef'. The trie looks like this: /// /// ```ignore /// a - S1 - b - S2 - c - S3 - d - S4* /// / /// S0 - c - S5 - e - S6 - f - S7* /// ``` /// /// At this point, it should be fairly straightforward to see how this /// trie can be used in a simplistic way. At any given position in the /// text we're searching (called the "subject" string), all we need to do /// is follow the transitions in the trie by consuming one transition for /// each byte in the subject string. If we reach a match state, then we can /// report that location as a match. /// /// The trick comes when searching a subject string like 'abcef'. We'll /// initially follow the transition from S0 to S1 and wind up in S3 after /// observing the 'c' byte. At this point, the next byte is 'e' but state /// S3 has no transition for 'e', so the search fails. We then would need /// to restart the search at the next position in 'abcef', which /// corresponds to 'b'. The match would fail, but the next search starting /// at 'c' would finally succeed. The problem with this approach is that /// we wind up searching the subject string potentially many times. In /// effect, this makes the algorithm have worst case `O(n * m)` complexity, /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead /// like to achieve an `O(n + m)` worst case complexity. /// /// This is where failure transitions come in. Instead of dying at S3 in /// the first search, the automaton can instruct the search to move to /// another part of the automaton that corresponds to a suffix of what /// we've seen so far. Recall that we've seen 'abc' in the subject string, /// and the automaton does indeed have a non-empty suffix, 'c', that could /// potentially lead to another match.
Thus, the actual Aho-Corasick /// automaton for our patterns in this case looks like this: /// /// ```ignore /// a - S1 - b - S2 - c - S3 - d - S4* /// / / /// / ---------------- /// / / /// S0 - c - S5 - e - S6 - f - S7* /// ``` /// /// That is, we have a failure transition from S3 to S5, which is followed /// exactly in cases when we are in state S3 but see any byte other than /// 'd' (that is, we've "failed" to find a match in this portion of our /// trie). We know we can transition back to S5 because we've already seen /// a 'c' byte, so we don't need to re-scan it. We can then pick back up /// with the search starting at S5 and complete our match. /// /// Adding failure transitions to a trie is fairly simple, but subtle. The /// key issue is that you might have multiple failure transitions that you /// need to follow. For example, look at the trie for the patterns /// 'abcd', 'b', 'bcd' and 'cd': /// /// ```ignore /// - a - S1 - b - S2* - c - S3 - d - S4* /// / / / /// / ------- ------- /// / / / /// S0 --- b - S5* - c - S6 - d - S7* /// \ / /// \ -------- /// \ / /// - c - S8 - d - S9* /// ``` /// /// The failure transitions for this trie are defined from S2 to S5, /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it /// corresponds to a match, since its failure transition to S5 is itself /// a match state. /// /// Perhaps the simplest way to think about adding these failure transitions /// is recursively. That is, if you know the failure transitions for every /// possible previous state that could be visited (e.g., when computing the /// failure transition for S3, you already know the failure transitions /// for S0, S1 and S2), then you can simply follow the failure transition /// of the previous state and check whether the incoming transition is /// defined after following the failure transition. /// /// For example, when determining the failure state for S3, by our /// assumptions, we already know that there is a failure transition from /// S2 (the previous state) to S5. So we follow that transition and check /// whether the transition connecting S2 to S3 is defined. Indeed, it is, /// as there is a transition from S5 to S6 for the byte 'c'. If no such /// transition existed, we could keep following the failure transitions /// until we reach the start state, which is the failure transition for /// every state that has no corresponding proper suffix. /// /// We don't actually use recursion to implement this, but instead, use a /// breadth first search of the automaton. Our base case is the start /// state, whose failure transition is just a transition to itself. /// /// When building a leftmost automaton, we proceed as above, but only /// include a subset of failure transitions. Namely, we omit any failure /// transitions that appear after a match state in the trie. This is /// because failure transitions always point back to a proper suffix of /// what has been seen so far. Thus, following a failure transition after /// a match implies looking for a match that starts after the one that has /// already been seen, which is of course therefore not the leftmost match. /// /// N.B. I came up with this algorithm on my own, and after scouring all of /// the other AC implementations I know of (Perl, Snort, many on GitHub), /// I couldn't find any that implement leftmost semantics like this. /// Perl of course needs leftmost-first semantics, but they implement it /// with a seeming hack at *search* time instead of encoding it into the /// automaton.
There are also a couple Java libraries that support leftmost /// longest semantics, but they do it by building a queue of matches at /// search time, which is even worse than what Perl is doing. ---AG fn fill_failure_transitions(&mut self) -> Result<(), BuildError> { let is_leftmost = self.builder.match_kind.is_leftmost(); let start_uid = self.nfa.special.start_unanchored_id; // Initialize the queue for breadth first search with all transitions // out of the start state. We handle the start state specially because // we only want to follow non-self transitions. If we followed self // transitions, then this would never terminate. let mut queue = VecDeque::new(); let mut seen = self.queued_set(); let mut prev_link = None; while let Some(link) = self.nfa.next_link(start_uid, prev_link) { prev_link = Some(link); let t = self.nfa.sparse[link]; // Skip anything we've seen before and any self-transitions on the // start state. if start_uid == t.next() || seen.contains(t.next) { continue; } queue.push_back(t.next); seen.insert(t.next); // Under leftmost semantics, if a state immediately following // the start state is a match state, then we never want to // follow its failure transition since the failure transition // necessarily leads back to the start state, which we never // want to do for leftmost matching after a match has been // found. // // We apply the same logic to non-start states below as well. if is_leftmost && self.nfa.states[t.next].is_match() { self.nfa.states[t.next].fail = NFA::DEAD; } } while let Some(id) = queue.pop_front() { let mut prev_link = None; while let Some(link) = self.nfa.next_link(id, prev_link) { prev_link = Some(link); let t = self.nfa.sparse[link]; if seen.contains(t.next) { // The only way to visit a duplicate state in a transition // list is when ASCII case insensitivity is enabled. In // this case, we want to skip it since it's redundant work. // But it would also end up duplicating matches, which // results in reporting duplicate matches in some cases. // See the 'acasei010' regression test. continue; } queue.push_back(t.next); seen.insert(t.next); // As above for start states, under leftmost semantics, once // we see a match all subsequent states should have no failure // transitions because failure transitions always imply looking // for a match that is a suffix of what has been seen so far // (where "seen so far" corresponds to the string formed by // following the transitions from the start state to the // current state). Under leftmost semantics, we specifically do // not want to allow this to happen because we always want to // report the match found at the leftmost position. // // The difference between leftmost-first and leftmost-longest // occurs previously while we build the trie. For // leftmost-first, we simply omit any entries that would // otherwise require passing through a match state. // // Note that for correctness, the failure transition has to be // set to the dead state for ALL states following a match, not // just the match state itself. However, by setting the failure // transition to the dead state on all match states, the dead // state will automatically propagate to all subsequent states // via the failure state computation below. 
if is_leftmost && self.nfa.states[t.next].is_match() { self.nfa.states[t.next].fail = NFA::DEAD; continue; } let mut fail = self.nfa.states[id].fail; while self.nfa.follow_transition(fail, t.byte) == NFA::FAIL { fail = self.nfa.states[fail].fail; } fail = self.nfa.follow_transition(fail, t.byte); self.nfa.states[t.next].fail = fail; self.nfa.copy_matches(fail, t.next)?; } // If the start state is a match state, then this automaton can // match the empty string. This implies all states are match states // since every position matches the empty string, so copy the // matches from the start state to every state. Strictly speaking, // this is only necessary for overlapping matches since each // non-empty non-start match state needs to report empty matches // in addition to its own. For the non-overlapping case, such // states only report the first match, which is never empty since // it isn't a start state. if !is_leftmost { self.nfa .copy_matches(self.nfa.special.start_unanchored_id, id)?; } } Ok(()) } /// Shuffle the states so that they appear in this sequence: /// /// DEAD, FAIL, MATCH..., START, START, NON-MATCH... /// /// The idea here is that if we know how special states are laid out in our /// transition table, then we can determine what "kind" of state we're in /// just by comparing our current state ID with a particular value. In this /// way, we avoid doing extra memory lookups. /// /// Before shuffling begins, our states look something like this: /// /// DEAD, FAIL, START, START, (MATCH | NON-MATCH)... /// /// So all we need to do is move all of the MATCH states so that they /// all appear before any NON-MATCH state, like so: /// /// DEAD, FAIL, START, START, MATCH... NON-MATCH... /// /// Then it's just a simple matter of swapping the two START states with /// the last two MATCH states. /// /// (This is the same technique used for fully compiled DFAs in /// regex-automata.) fn shuffle(&mut self) { let old_start_uid = self.nfa.special.start_unanchored_id; let old_start_aid = self.nfa.special.start_anchored_id; assert!(old_start_uid < old_start_aid); assert_eq!( 3, old_start_aid.as_usize(), "anchored start state should be at index 3" ); // We implement shuffling by a sequence of pairwise swaps of states. // Since we have a number of things referencing states via their // IDs and swapping them changes their IDs, we need to record every // swap we make so that we can remap IDs. The remapper handles this // book-keeping for us. let mut remapper = Remapper::new(&self.nfa, 0); // The way we proceed here is by moving all match states so that // they directly follow the start states. So it will go: DEAD, FAIL, // START-UNANCHORED, START-ANCHORED, MATCH, ..., NON-MATCH, ... // // To do that, we proceed forward through all states after // START-ANCHORED and swap match states so that they appear before all // non-match states. let mut next_avail = StateID::from(4u8); for i in next_avail.as_usize()..self.nfa.states.len() { let sid = StateID::new(i).unwrap(); if !self.nfa.states[sid].is_match() { continue; } remapper.swap(&mut self.nfa, sid, next_avail); // The key invariant here is that only non-match states exist // between 'next_avail' and 'sid' (with them being potentially // equivalent). Thus, incrementing 'next_avail' by 1 is guaranteed // to land on the leftmost non-match state. (Unless 'next_avail' // and 'sid' are equivalent, in which case, a swap will occur but // it is a no-op.) 
next_avail = StateID::new(next_avail.one_more()).unwrap(); } // Now we'd like to move the start states to immediately following the // match states. (The start states may themselves be match states, but // we'll handle that later.) We arrange the states this way so that we // don't necessarily need to check whether a state is a start state or // not before checking whether a state is a match state. For example, // we'd like to be able to write this as our state machine loop: // // sid = start() // for byte in haystack: // sid = next(sid, byte) // if sid <= nfa.max_start_id: // if sid <= nfa.max_dead_id: // # search complete // elif sid <= nfa.max_match_id: // # found match // // The important context here is that we might not want to look for // start states at all. Namely, if a searcher doesn't have a prefilter, // then there is no reason to care about whether we're in a start state // or not. And indeed, if we did check for it, this very hot loop would // ping pong between the special state handling and the main state // transition logic. This in turn stalls the CPU by killing branch // prediction. // // So essentially, we really want to be able to "forget" that start // states even exist and this is why we put them at the end. let new_start_aid = StateID::new(next_avail.as_usize().checked_sub(1).unwrap()) .unwrap(); remapper.swap(&mut self.nfa, old_start_aid, new_start_aid); let new_start_uid = StateID::new(next_avail.as_usize().checked_sub(2).unwrap()) .unwrap(); remapper.swap(&mut self.nfa, old_start_uid, new_start_uid); let new_max_match_id = StateID::new(next_avail.as_usize().checked_sub(3).unwrap()) .unwrap(); self.nfa.special.max_match_id = new_max_match_id; self.nfa.special.start_unanchored_id = new_start_uid; self.nfa.special.start_anchored_id = new_start_aid; // If one start state is a match state, then they both are. if self.nfa.states[self.nfa.special.start_anchored_id].is_match() { self.nfa.special.max_match_id = self.nfa.special.start_anchored_id; } remapper.remap(&mut self.nfa); } /// Attempts to convert the transition representation of a subset of states /// in this NFA from sparse to dense. This can greatly improve search /// performance since states with a higher number of transitions tend to /// correlate with very active states. /// /// We generally only densify states that are close to the start state. /// These tend to be the most active states and thus benefit from a dense /// representation more than other states. /// /// This tends to best balance between memory usage and performance. In /// particular, the *vast majority* of all states in a typical Aho-Corasick /// automaton have only 1 transition and are usually farther from the start /// state and thus don't get densified. /// /// Note that this doesn't remove the sparse representation of transitions /// for states that are densified. It could be done, but actually removing /// entries from `NFA::sparse` is likely more expensive than it's worth. fn densify(&mut self) -> Result<(), BuildError> { for i in 0..self.nfa.states.len() { let sid = StateID::new(i).unwrap(); // Don't bother densifying states that are only used as sentinels. if sid == NFA::DEAD || sid == NFA::FAIL { continue; } // Only densify states that are "close enough" to the start state. 
if self.nfa.states[sid].depth.as_usize() >= self.builder.dense_depth { continue; } let dense = self.nfa.alloc_dense_state()?; let mut prev_link = None; while let Some(link) = self.nfa.next_link(sid, prev_link) { prev_link = Some(link); let t = self.nfa.sparse[link]; let class = usize::from(self.nfa.byte_classes.get(t.byte)); let index = dense.as_usize() + class; self.nfa.dense[index] = t.next; } self.nfa.states[sid].dense = dense; } Ok(()) } /// Returns a set that tracks queued states. /// /// This is only necessary when ASCII case insensitivity is enabled, since /// it is the only way to visit the same state twice. Otherwise, this /// returns an inert set that never adds anything and always reports /// `false` for every member test. fn queued_set(&self) -> QueuedSet { if self.builder.ascii_case_insensitive { QueuedSet::active() } else { QueuedSet::inert() } } /// Initializes the unanchored and anchored start states by making them /// dense. This is achieved by explicitly setting every transition to the /// FAIL state. This isn't necessary for correctness, since any missing /// transition is automatically assumed to be mapped to the FAIL state. We /// do this to make the start states dense, and thus in turn make /// transition lookups on them faster. (Which is worth doing because the /// unanchored start state is the most active state.) fn init_unanchored_start_state(&mut self) -> Result<(), BuildError> { let start_uid = self.nfa.special.start_unanchored_id; let start_aid = self.nfa.special.start_anchored_id; self.nfa.init_full_state(start_uid, NFA::FAIL)?; self.nfa.init_full_state(start_aid, NFA::FAIL)?; Ok(()) } /// Set up the anchored start state by copying all of the transitions and /// matches from the unanchored starting state, with one change: the failure /// transition is changed to the DEAD state, so that for any undefined /// transitions, the search will stop. fn set_anchored_start_state(&mut self) -> Result<(), BuildError> { let start_uid = self.nfa.special.start_unanchored_id; let start_aid = self.nfa.special.start_anchored_id; let (mut uprev_link, mut aprev_link) = (None, None); loop { let unext = self.nfa.next_link(start_uid, uprev_link); let anext = self.nfa.next_link(start_aid, aprev_link); let (ulink, alink) = match (unext, anext) { (Some(ulink), Some(alink)) => (ulink, alink), (None, None) => break, _ => unreachable!(), }; uprev_link = Some(ulink); aprev_link = Some(alink); self.nfa.sparse[alink].next = self.nfa.sparse[ulink].next; } self.nfa.copy_matches(start_uid, start_aid)?; // This is the main difference between the unanchored and anchored // starting states. If a lookup on an anchored starting state fails, // then the search should stop. // // N.B. This assumes that the loop on the unanchored starting state // hasn't been created yet. self.nfa.states[start_aid].fail = NFA::DEAD; Ok(()) } /// Set the failure transitions on the start state to loop back to the /// start state. This effectively permits the Aho-Corasick automaton to /// match at any position. This is also required for finding the next /// state to terminate; namely, finding the next state should never return /// a fail_id. /// /// This must be done after building the initial trie, since trie /// construction depends on transitions to `fail_id` to determine whether a /// state already exists or not.
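///
/// As a rough sketch (the state IDs here are illustrative only), for the
/// single pattern "a", this rewrites the unanchored start state's
/// transitions from:
///
/// ```ignore
/// S2: a => S4, [every other byte] => FAIL
/// ```
///
/// to:
///
/// ```ignore
/// S2: a => S4, [every other byte] => S2
/// ```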
fn add_unanchored_start_state_loop(&mut self) { let start_uid = self.nfa.special.start_unanchored_id; let mut prev_link = None; while let Some(link) = self.nfa.next_link(start_uid, prev_link) { prev_link = Some(link); if self.nfa.sparse[link].next() == NFA::FAIL { self.nfa.sparse[link].next = start_uid; } } } /// Remove the start state loop by rewriting any transitions on the start /// state back to the start state with transitions to the dead state. /// /// The loop is only closed when two conditions are met: the start state /// is a match state and the match kind is leftmost-first or /// leftmost-longest. /// /// The reason for this is that under leftmost semantics, a start state /// that is also a match implies that we should never restart the search /// process. We allow normal transitions out of the start state, but if /// none exist, we transition to the dead state, which signals that /// searching should stop. fn close_start_state_loop_for_leftmost(&mut self) { let start_uid = self.nfa.special.start_unanchored_id; let start = &mut self.nfa.states[start_uid]; let dense = start.dense; if self.builder.match_kind.is_leftmost() && start.is_match() { let mut prev_link = None; while let Some(link) = self.nfa.next_link(start_uid, prev_link) { prev_link = Some(link); if self.nfa.sparse[link].next() == start_uid { self.nfa.sparse[link].next = NFA::DEAD; if dense != StateID::ZERO { let b = self.nfa.sparse[link].byte; let class = usize::from(self.nfa.byte_classes.get(b)); self.nfa.dense[dense.as_usize() + class] = NFA::DEAD; } } } } } /// Sets all transitions on the dead state to point back to the dead state. /// Normally, missing transitions map back to the failure state, but the /// point of the dead state is to act as a sink that can never be escaped. fn add_dead_state_loop(&mut self) -> Result<(), BuildError> { self.nfa.init_full_state(NFA::DEAD, NFA::DEAD)?; Ok(()) } } /// A set of state identifiers used to avoid revisiting the same state multiple /// times when filling in failure transitions. /// /// This set has an "inert" and an "active" mode. When inert, the set never /// stores anything and always returns `false` for every member test. This is /// useful to avoid the performance and memory overhead of maintaining this /// set when it is not needed. #[derive(Debug)] struct QueuedSet { set: Option<BTreeSet<StateID>>, } impl QueuedSet { /// Return an inert set that returns `false` for every state ID membership /// test. fn inert() -> QueuedSet { QueuedSet { set: None } } /// Return an active set that tracks state ID membership. fn active() -> QueuedSet { QueuedSet { set: Some(BTreeSet::new()) } } /// Inserts the given state ID into this set. (If the set is inert, then /// this is a no-op.) fn insert(&mut self, state_id: StateID) { if let Some(ref mut set) = self.set { set.insert(state_id); } } /// Returns true if and only if the given state ID is in this set. If the /// set is inert, this always returns false. fn contains(&self, state_id: StateID) -> bool { match self.set { None => false, Some(ref set) => set.contains(&state_id), } } } impl core::fmt::Debug for NFA { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use crate::{ automaton::{fmt_state_indicator, sparse_transitions}, util::debug::DebugByte, }; writeln!(f, "noncontiguous::NFA(")?; for (sid, state) in self.states.iter().with_state_ids() { // The FAIL state doesn't actually have space for a state allocated // for it, so we have to treat it as a special case.
if sid == NFA::FAIL { writeln!(f, "F {:06}:", sid.as_usize())?; continue; } fmt_state_indicator(f, self, sid)?; write!( f, "{:06}({:06}): ", sid.as_usize(), state.fail.as_usize() )?; let it = sparse_transitions( self.iter_trans(sid).map(|t| (t.byte, t.next)), ) .enumerate(); for (i, (start, end, sid)) in it { if i > 0 { write!(f, ", ")?; } if start == end { write!( f, "{:?} => {:?}", DebugByte(start), sid.as_usize() )?; } else { write!( f, "{:?}-{:?} => {:?}", DebugByte(start), DebugByte(end), sid.as_usize() )?; } } write!(f, "\n")?; if self.is_match(sid) { write!(f, " matches: ")?; for (i, pid) in self.iter_matches(sid).enumerate() { if i > 0 { write!(f, ", ")?; } write!(f, "{}", pid.as_usize())?; } write!(f, "\n")?; } } writeln!(f, "match kind: {:?}", self.match_kind)?; writeln!(f, "prefilter: {:?}", self.prefilter.is_some())?; writeln!(f, "state length: {:?}", self.states.len())?; writeln!(f, "pattern length: {:?}", self.patterns_len())?; writeln!(f, "shortest pattern length: {:?}", self.min_pattern_len)?; writeln!(f, "longest pattern length: {:?}", self.max_pattern_len)?; writeln!(f, "memory usage: {:?}", self.memory_usage())?; writeln!(f, ")")?; Ok(()) } } aho-corasick-1.1.3/src/packed/api.rs000064400000000000000000000547211046102023000153370ustar 00000000000000use alloc::sync::Arc; use crate::{ packed::{pattern::Patterns, rabinkarp::RabinKarp, teddy}, util::search::{Match, Span}, }; /// This is a limit placed on the total number of patterns we're willing to try /// and match at once. As more sophisticated algorithms are added, this number /// may be increased. const PATTERN_LIMIT: usize = 128; /// A knob for controlling the match semantics of a packed multiple string /// searcher. /// /// This differs from the [`MatchKind`](crate::MatchKind) type in the top-level /// crate module in that it doesn't support "standard" match semantics, /// and instead only supports leftmost-first or leftmost-longest. Namely, /// "standard" semantics cannot be easily supported by packed searchers. /// /// For more information on the distinction between leftmost-first and /// leftmost-longest, see the docs on the top-level `MatchKind` type. /// /// Unlike the top-level `MatchKind` type, the default match semantics for this /// type are leftmost-first. #[derive(Clone, Copy, Debug, Eq, PartialEq)] #[non_exhaustive] pub enum MatchKind { /// Use leftmost-first match semantics, which reports leftmost matches. /// When there are multiple possible leftmost matches, the match /// corresponding to the pattern that appeared earlier when constructing /// the automaton is reported. /// /// This is the default. LeftmostFirst, /// Use leftmost-longest match semantics, which reports leftmost matches. /// When there are multiple possible leftmost matches, the longest match /// is chosen. LeftmostLongest, } impl Default for MatchKind { fn default() -> MatchKind { MatchKind::LeftmostFirst } } /// The configuration for a packed multiple pattern searcher. /// /// The configuration is currently limited only to being able to select the /// match semantics (leftmost-first or leftmost-longest) of a searcher. In the /// future, more knobs may be made available. /// /// A configuration produces a [`packed::Builder`](Builder), which in turn can /// be used to construct a [`packed::Searcher`](Searcher) for searching. /// /// # Example /// /// This example shows how to use leftmost-longest semantics instead of the /// default (leftmost-first). 
/// /// ``` /// use aho_corasick::{packed::{Config, MatchKind}, PatternID}; /// /// # fn example() -> Option<()> { /// let searcher = Config::new() /// .match_kind(MatchKind::LeftmostLongest) /// .builder() /// .add("foo") /// .add("foobar") /// .build()?; /// let matches: Vec<PatternID> = searcher /// .find_iter("foobar") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![PatternID::must(1)], matches); /// # Some(()) } /// # if cfg!(all(feature = "std", any( /// # target_arch = "x86_64", target_arch = "aarch64", /// # ))) { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[derive(Clone, Debug)] pub struct Config { kind: MatchKind, force: Option<ForceAlgorithm>, only_teddy_fat: Option<bool>, only_teddy_256bit: Option<bool>, heuristic_pattern_limits: bool, } /// An internal option for forcing the use of a particular packed algorithm. /// /// When an algorithm is forced, if a searcher could not be constructed for it, /// then no searcher will be returned even if an alternative algorithm would /// work. #[derive(Clone, Debug)] enum ForceAlgorithm { Teddy, RabinKarp, } impl Default for Config { fn default() -> Config { Config::new() } } impl Config { /// Create a new default configuration. A default configuration uses /// leftmost-first match semantics. pub fn new() -> Config { Config { kind: MatchKind::LeftmostFirst, force: None, only_teddy_fat: None, only_teddy_256bit: None, heuristic_pattern_limits: true, } } /// Create a packed builder from this configuration. The builder can be /// used to accumulate patterns and create a [`Searcher`] from them. pub fn builder(&self) -> Builder { Builder::from_config(self.clone()) } /// Set the match semantics for this configuration. pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { self.kind = kind; self } /// An undocumented method for forcing the use of the Teddy algorithm. /// /// This is only exposed for more precise testing and benchmarks. Callers /// should not use it as it is not part of the API stability guarantees of /// this crate. #[doc(hidden)] pub fn only_teddy(&mut self, yes: bool) -> &mut Config { if yes { self.force = Some(ForceAlgorithm::Teddy); } else { self.force = None; } self } /// An undocumented method for forcing the use of the Fat Teddy algorithm. /// /// This is only exposed for more precise testing and benchmarks. Callers /// should not use it as it is not part of the API stability guarantees of /// this crate. #[doc(hidden)] pub fn only_teddy_fat(&mut self, yes: Option<bool>) -> &mut Config { self.only_teddy_fat = yes; self } /// An undocumented method for forcing the use of SSE (`Some(false)`) or /// AVX (`Some(true)`) algorithms. /// /// This is only exposed for more precise testing and benchmarks. Callers /// should not use it as it is not part of the API stability guarantees of /// this crate. #[doc(hidden)] pub fn only_teddy_256bit(&mut self, yes: Option<bool>) -> &mut Config { self.only_teddy_256bit = yes; self } /// An undocumented method for forcing the use of the Rabin-Karp algorithm. /// /// This is only exposed for more precise testing and benchmarks. Callers /// should not use it as it is not part of the API stability guarantees of /// this crate. #[doc(hidden)] pub fn only_rabin_karp(&mut self, yes: bool) -> &mut Config { if yes { self.force = Some(ForceAlgorithm::RabinKarp); } else { self.force = None; } self } /// Request that heuristic limitations on the number of patterns be /// employed.
This is useful to disable for benchmarking, where one wants to /// explore how Teddy performs on a large number of patterns even if the /// heuristics would otherwise refuse construction. /// /// This is enabled by default. pub fn heuristic_pattern_limits(&mut self, yes: bool) -> &mut Config { self.heuristic_pattern_limits = yes; self } } /// A builder for constructing a packed searcher from a collection of patterns. /// /// # Example /// /// This example shows how to use a builder to construct a searcher. By /// default, leftmost-first match semantics are used. /// /// ``` /// use aho_corasick::{packed::{Builder, MatchKind}, PatternID}; /// /// # fn example() -> Option<()> { /// let searcher = Builder::new() /// .add("foobar") /// .add("foo") /// .build()?; /// let matches: Vec<PatternID> = searcher /// .find_iter("foobar") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![PatternID::ZERO], matches); /// # Some(()) } /// # if cfg!(all(feature = "std", any( /// # target_arch = "x86_64", target_arch = "aarch64", /// # ))) { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[derive(Clone, Debug)] pub struct Builder { /// The configuration of this builder and subsequent matcher. config: Config, /// Set to true if the builder detects that a matcher cannot be built. inert: bool, /// The patterns provided by the caller. patterns: Patterns, } impl Builder { /// Create a new builder for constructing a multi-pattern searcher. This /// constructor uses the default configuration. pub fn new() -> Builder { Builder::from_config(Config::new()) } fn from_config(config: Config) -> Builder { Builder { config, inert: false, patterns: Patterns::new() } } /// Build a searcher from the patterns added to this builder so far. pub fn build(&self) -> Option<Searcher> { if self.inert || self.patterns.is_empty() { return None; } let mut patterns = self.patterns.clone(); patterns.set_match_kind(self.config.kind); let patterns = Arc::new(patterns); let rabinkarp = RabinKarp::new(&patterns); // Effectively, we only want to return a searcher if we can use Teddy, // since Teddy is our only fast packed searcher at the moment. // Rabin-Karp is only used when searching haystacks smaller than what // Teddy can support. Thus, the only way to get a Rabin-Karp searcher // is to force it using undocumented APIs (for tests/benchmarks). let (search_kind, minimum_len) = match self.config.force { None | Some(ForceAlgorithm::Teddy) => { debug!("trying to build Teddy packed matcher"); let teddy = match self.build_teddy(Arc::clone(&patterns)) { None => return None, Some(teddy) => teddy, }; let minimum_len = teddy.minimum_len(); (SearchKind::Teddy(teddy), minimum_len) } Some(ForceAlgorithm::RabinKarp) => { debug!("using Rabin-Karp packed matcher"); (SearchKind::RabinKarp, 0) } }; Some(Searcher { patterns, rabinkarp, search_kind, minimum_len }) } fn build_teddy(&self, patterns: Arc<Patterns>) -> Option<teddy::Searcher> { teddy::Builder::new() .only_256bit(self.config.only_teddy_256bit) .only_fat(self.config.only_teddy_fat) .heuristic_pattern_limits(self.config.heuristic_pattern_limits) .build(patterns) } /// Add the given pattern to this set to match. /// /// The order in which patterns are added is significant. Namely, when /// using leftmost-first match semantics, then when multiple patterns can /// match at a particular location, the pattern that was added first is /// used as the match.
/// /// If the number of patterns added exceeds the amount supported by packed /// searchers, then the builder will stop accumulating patterns and render /// itself inert. At this point, constructing a searcher will always return /// `None`. pub fn add<P: AsRef<[u8]>>(&mut self, pattern: P) -> &mut Builder { if self.inert { return self; } else if self.patterns.len() >= PATTERN_LIMIT { self.inert = true; self.patterns.reset(); return self; } // Just in case PATTERN_LIMIT increases beyond u16::MAX. assert!(self.patterns.len() <= core::u16::MAX as usize); let pattern = pattern.as_ref(); if pattern.is_empty() { self.inert = true; self.patterns.reset(); return self; } self.patterns.add(pattern); self } /// Add the given iterator of patterns to this set to match. /// /// The iterator must yield elements that can be converted into a `&[u8]`. /// /// The order in which patterns are added is significant. Namely, when /// using leftmost-first match semantics, then when multiple patterns can /// match at a particular location, the pattern that was added first is /// used as the match. /// /// If the number of patterns added exceeds the amount supported by packed /// searchers, then the builder will stop accumulating patterns and render /// itself inert. At this point, constructing a searcher will always return /// `None`. pub fn extend<I, P>(&mut self, patterns: I) -> &mut Builder where I: IntoIterator<Item = P>, P: AsRef<[u8]>, { for p in patterns { self.add(p); } self } /// Returns the number of patterns added to this builder. pub fn len(&self) -> usize { self.patterns.len() } /// Returns the length, in bytes, of the shortest pattern added. pub fn minimum_len(&self) -> usize { self.patterns.minimum_len() } } impl Default for Builder { fn default() -> Builder { Builder::new() } } /// A packed searcher for quickly finding occurrences of multiple patterns. /// /// If callers need more flexible construction, or if one wants to change the /// match semantics (either leftmost-first or leftmost-longest), then one can /// use the [`Config`] and/or [`Builder`] types for more fine grained control. /// /// # Example /// /// This example shows how to create a searcher from an iterator of patterns. /// By default, leftmost-first match semantics are used. /// /// ``` /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let matches: Vec<PatternID> = searcher /// .find_iter("foobar") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![PatternID::ZERO], matches); /// # Some(()) } /// # if cfg!(all(feature = "std", any( /// # target_arch = "x86_64", target_arch = "aarch64", /// # ))) { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[derive(Clone, Debug)] pub struct Searcher { patterns: Arc<Patterns>, rabinkarp: RabinKarp, search_kind: SearchKind, minimum_len: usize, } #[derive(Clone, Debug)] enum SearchKind { Teddy(teddy::Searcher), RabinKarp, } impl Searcher { /// A convenience function for constructing a searcher from an iterator /// of things that can be converted to a `&[u8]`. /// /// If a searcher could not be constructed (either because of an /// unsupported CPU or because there are too many patterns), then `None` /// is returned.
/// /// # Example /// /// Basic usage: /// /// ``` /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let matches: Vec<PatternID> = searcher /// .find_iter("foobar") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![PatternID::ZERO], matches); /// # Some(()) } /// # if cfg!(all(feature = "std", any( /// # target_arch = "x86_64", target_arch = "aarch64", /// # ))) { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` pub fn new<I, P>(patterns: I) -> Option<Searcher> where I: IntoIterator<Item = P>, P: AsRef<[u8]>, { Builder::new().extend(patterns).build() } /// A convenience function for calling `Config::new()`. /// /// This is useful for avoiding an additional import. pub fn config() -> Config { Config::new() } /// A convenience function for calling `Builder::new()`. /// /// This is useful for avoiding an additional import. pub fn builder() -> Builder { Builder::new() } /// Return the first occurrence of any of the patterns in this searcher, /// according to its match semantics, in the given haystack. The `Match` /// returned will include the identifier of the pattern that matched, which /// corresponds to the index of the pattern (starting from `0`) at which it /// was added. /// /// # Example /// /// Basic usage: /// /// ``` /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let mat = searcher.find("foobar")?; /// assert_eq!(PatternID::ZERO, mat.pattern()); /// assert_eq!(0, mat.start()); /// assert_eq!(6, mat.end()); /// # Some(()) } /// # if cfg!(all(feature = "std", any( /// # target_arch = "x86_64", target_arch = "aarch64", /// # ))) { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[inline] pub fn find<B: ?Sized + AsRef<[u8]>>(&self, haystack: &B) -> Option<Match> { let haystack = haystack.as_ref(); self.find_in(haystack, Span::from(0..haystack.len())) } /// Return the first occurrence of any of the patterns in this searcher, /// according to its match semantics, in the given haystack starting from /// the given position. /// /// The `Match` returned will include the identifier of the pattern that /// matched, which corresponds to the index of the pattern (starting from /// `0`) at which it was added. The offsets in the `Match` will be relative /// to the start of `haystack` (and not `at`).
/// /// # Example /// /// Basic usage: /// /// ``` /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID, Span}; /// /// # fn example() -> Option<()> { /// let haystack = "foofoobar"; /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let mat = searcher.find_in(haystack, Span::from(3..haystack.len()))?; /// assert_eq!(PatternID::ZERO, mat.pattern()); /// assert_eq!(3, mat.start()); /// assert_eq!(9, mat.end()); /// # Some(()) } /// # if cfg!(all(feature = "std", any( /// # target_arch = "x86_64", target_arch = "aarch64", /// # ))) { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[inline] pub fn find_in<B: ?Sized + AsRef<[u8]>>( &self, haystack: &B, span: Span, ) -> Option<Match> { let haystack = haystack.as_ref(); match self.search_kind { SearchKind::Teddy(ref teddy) => { if haystack[span].len() < teddy.minimum_len() { return self.find_in_slow(haystack, span); } teddy.find(&haystack[..span.end], span.start) } SearchKind::RabinKarp => { self.rabinkarp.find_at(&haystack[..span.end], span.start) } } } /// Return an iterator of non-overlapping occurrences of the patterns in /// this searcher, according to its match semantics, in the given haystack. /// /// # Example /// /// Basic usage: /// /// ``` /// use aho_corasick::{packed::{MatchKind, Searcher}, PatternID}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let matches: Vec<PatternID> = searcher /// .find_iter("foobar fooba foofoo") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![ /// PatternID::must(0), /// PatternID::must(1), /// PatternID::must(1), /// PatternID::must(1), /// ], matches); /// # Some(()) } /// # if cfg!(all(feature = "std", any( /// # target_arch = "x86_64", target_arch = "aarch64", /// # ))) { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[inline] pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindIter<'a, 'b> { let haystack = haystack.as_ref(); let span = Span::from(0..haystack.len()); FindIter { searcher: self, haystack, span } } /// Returns the match kind used by this packed searcher. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::packed::{MatchKind, Searcher}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// // leftmost-first is the default. /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind()); /// # Some(()) } /// # if cfg!(all(feature = "std", any( /// # target_arch = "x86_64", target_arch = "aarch64", /// # ))) { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[inline] pub fn match_kind(&self) -> &MatchKind { self.patterns.match_kind() } /// Returns the minimum length of a haystack that is required in order for /// packed searching to be effective. /// /// In some cases, the underlying packed searcher may not be able to search /// very short haystacks. When that occurs, the implementation will defer /// to a slower non-packed searcher (which is still generally faster than /// Aho-Corasick for a small number of patterns). However, callers may /// want to avoid ever using the slower variant, which one can do by /// never passing a haystack shorter than the minimum length returned by /// this method. #[inline] pub fn minimum_len(&self) -> usize { self.minimum_len } /// Returns the approximate total amount of heap used by this searcher, in /// units of bytes.
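///
/// # Example
///
/// A small sketch of basic usage. (The exact byte count reported is
/// unspecified; we only assume it is non-zero for a non-empty set of
/// patterns.)
///
/// ```
/// use aho_corasick::packed::Searcher;
///
/// # fn example() -> Option<()> {
/// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
/// assert!(searcher.memory_usage() > 0);
/// # Some(()) }
/// # if cfg!(all(feature = "std", any(
/// #     target_arch = "x86_64", target_arch = "aarch64",
/// # ))) {
/// #     example().unwrap()
/// # } else {
/// #     assert!(example().is_none());
/// # }
/// ```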
#[inline] pub fn memory_usage(&self) -> usize { self.patterns.memory_usage() + self.rabinkarp.memory_usage() + self.search_kind.memory_usage() } /// Use a slow (non-packed) searcher. /// /// This is useful when a packed searcher could be constructed, but could /// not be used to search a specific haystack. For example, if Teddy was /// built but the haystack is smaller than ~34 bytes, then Teddy might not /// be able to run. fn find_in_slow(&self, haystack: &[u8], span: Span) -> Option<Match> { self.rabinkarp.find_at(&haystack[..span.end], span.start) } } impl SearchKind { fn memory_usage(&self) -> usize { match *self { SearchKind::Teddy(ref ted) => ted.memory_usage(), SearchKind::RabinKarp => 0, } } } /// An iterator over non-overlapping matches from a packed searcher. /// /// The lifetime `'s` refers to the lifetime of the underlying [`Searcher`], /// while the lifetime `'h` refers to the lifetime of the haystack being /// searched. #[derive(Debug)] pub struct FindIter<'s, 'h> { searcher: &'s Searcher, haystack: &'h [u8], span: Span, } impl<'s, 'h> Iterator for FindIter<'s, 'h> { type Item = Match; fn next(&mut self) -> Option<Match> { if self.span.start > self.span.end { return None; } match self.searcher.find_in(&self.haystack, self.span) { None => None, Some(m) => { self.span.start = m.end(); Some(m) } } } } aho-corasick-1.1.3/src/packed/ext.rs000064400000000000000000000023631046102023000153610ustar 00000000000000/// A trait for adding some helper routines to pointers. pub(crate) trait Pointer { /// Returns the distance, in units of `T`, between `self` and `origin`. /// /// # Safety /// /// Same as `ptr::offset_from` in addition to `self >= origin`. unsafe fn distance(self, origin: Self) -> usize; /// Casts this pointer to `usize`. /// /// Callers should not convert the `usize` back to a pointer if at all /// possible. (And if you believe it's necessary, open an issue to discuss /// why. Otherwise, it has the potential to violate pointer provenance.) /// The purpose of this function is just to be able to do arithmetic, i.e., /// computing offsets or alignments. fn as_usize(self) -> usize; } impl<T> Pointer for *const T { unsafe fn distance(self, origin: *const T) -> usize { // TODO: Replace with `ptr::sub_ptr` once stabilized. usize::try_from(self.offset_from(origin)).unwrap_unchecked() } fn as_usize(self) -> usize { self as usize } } impl<T> Pointer for *mut T { unsafe fn distance(self, origin: *mut T) -> usize { (self as *const T).distance(origin as *const T) } fn as_usize(self) -> usize { (self as *const T).as_usize() } } aho-corasick-1.1.3/src/packed/mod.rs000064400000000000000000000105461046102023000153420ustar 00000000000000/*! Provides packed multiple substring search, principally for a small number of patterns. This sub-module provides vectorized routines for quickly finding matches of a small number of patterns. In general, users of this crate shouldn't need to interface with this module directly, as the primary [`AhoCorasick`](crate::AhoCorasick) searcher will use these routines automatically as a prefilter when applicable. However, in some cases, callers may want to bypass the Aho-Corasick machinery entirely and use this vectorized searcher directly. # Overview The primary types in this sub-module are: * [`Searcher`] executes the actual search algorithm to report matches in a haystack. * [`Builder`] accumulates patterns incrementally and can construct a `Searcher`. * [`Config`] permits tuning the searcher, and itself will produce a `Builder` (which can then be used to build a `Searcher`).
Currently, the only tuneable knob is the match semantics, but this may be
expanded in the future.

# Examples

This example shows how to create a searcher from an iterator of patterns.
By default, leftmost-first match semantics are used. (See the top-level
[`MatchKind`] type for more details about match semantics, which apply
similarly to packed substring search.)

```
use aho_corasick::{packed::{MatchKind, Searcher}, PatternID};

# fn example() -> Option<()> {
let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?;
let matches: Vec<PatternID> = searcher
    .find_iter("foobar")
    .map(|mat| mat.pattern())
    .collect();
assert_eq!(vec![PatternID::ZERO], matches);
# Some(()) }
# if cfg!(all(feature = "std", any(
#     target_arch = "x86_64", target_arch = "aarch64",
# ))) {
#     example().unwrap()
# } else {
#     assert!(example().is_none());
# }
```

This example shows how to use [`Config`] to change the match semantics to
leftmost-longest:

```
use aho_corasick::{packed::{Config, MatchKind}, PatternID};

# fn example() -> Option<()> {
let searcher = Config::new()
    .match_kind(MatchKind::LeftmostLongest)
    .builder()
    .add("foo")
    .add("foobar")
    .build()?;
let matches: Vec<PatternID> = searcher
    .find_iter("foobar")
    .map(|mat| mat.pattern())
    .collect();
assert_eq!(vec![PatternID::must(1)], matches);
# Some(()) }
# if cfg!(all(feature = "std", any(
#     target_arch = "x86_64", target_arch = "aarch64",
# ))) {
#     example().unwrap()
# } else {
#     assert!(example().is_none());
# }
```

# Packed substring searching

Packed substring searching refers to the use of SIMD (Single Instruction,
Multiple Data) to accelerate the detection of matches in a haystack. Unlike
conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring
search tend to do better with a small number of patterns, whereas
Aho-Corasick generally maintains reasonably consistent performance
regardless of the number of patterns you give it. Because of this, the
vectorized searcher in this sub-module cannot be used as a general purpose
searcher, since building the searcher may fail even when given a small
number of patterns. However, in exchange, when searching for a small number
of patterns, searching can be quite a bit faster than Aho-Corasick
(sometimes by an order of magnitude).

The key takeaway here is that constructing a searcher from a list of
patterns is a fallible operation with no clear rules for when it will fail.
While the precise conditions under which building a searcher can fail are
specifically an implementation detail, here are some common reasons:

* Too many patterns were given. Typically, the limit is on the order of 100
or so, but this limit may fluctuate based on available CPU features.
* The available packed algorithms require CPU features that aren't
available. For example, currently, this crate only provides packed
algorithms for `x86_64` and `aarch64`. Therefore, constructing a packed
searcher on any other target will always fail.
* Zero patterns were given, or one of the patterns given was empty. Packed
searchers require at least one pattern and that all patterns are non-empty.
* Something else about the nature of the patterns (typically based on
heuristics) suggests that a packed searcher would perform very poorly, so no
searcher is built.
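To make that fallibility concrete, here is a minimal sketch. (The precise
failure conditions are an implementation detail, but as documented above, an
empty pattern always results in no searcher being built, regardless of which
CPU features are available.)

```
use aho_corasick::packed::Config;

// One of the patterns is empty, so no packed searcher is built.
let searcher = Config::new().builder().add("").add("foo").build();
assert!(searcher.is_none());
```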
*/

pub use crate::packed::api::{Builder, Config, FindIter, MatchKind, Searcher};

mod api;
mod ext;
mod pattern;
mod rabinkarp;
mod teddy;
#[cfg(all(feature = "std", test))]
mod tests;
mod vector;
aho-corasick-1.1.3/src/packed/pattern.rs000064400000000000000000000425751046102023000162470ustar 00000000000000use core::{cmp, fmt, mem, u16, usize};

use alloc::{boxed::Box, string::String, vec, vec::Vec};

use crate::{
    packed::{api::MatchKind, ext::Pointer},
    PatternID,
};

/// A non-empty collection of non-empty patterns to search for.
///
/// This collection of patterns is what is passed around to both execute
/// searches and to construct the searchers themselves. Namely, this permits
/// searches to avoid copying all of the patterns, and allows us to keep only
/// one copy throughout all packed searchers.
///
/// Note that this collection is not a set. The same pattern can appear more
/// than once.
#[derive(Clone, Debug)]
pub(crate) struct Patterns {
    /// The match semantics supported by this collection of patterns.
    ///
    /// The match semantics determines the order of the iterator over
    /// patterns. For leftmost-first, patterns are provided in the same order
    /// as were provided by the caller. For leftmost-longest, patterns are
    /// provided in descending order of length, with ties broken by the order
    /// in which they were provided by the caller.
    kind: MatchKind,
    /// The collection of patterns, indexed by their identifier.
    by_id: Vec<Vec<u8>>,
    /// The order of patterns defined for iteration, given by pattern
    /// identifiers. The order of `by_id` and `order` is always the same for
    /// leftmost-first semantics, but may be different for leftmost-longest
    /// semantics.
    order: Vec<PatternID>,
    /// The length of the smallest pattern, in bytes.
    minimum_len: usize,
    /// The total number of pattern bytes across the entire collection. This
    /// is used for reporting total heap usage in constant time.
    total_pattern_bytes: usize,
}

// BREADCRUMBS: I think we want to experiment with a different bucket
// representation. Basically, each bucket is just a `Range<usize>` into a
// single contiguous allocation? Maybe length-prefixed patterns or something?
// The idea is to try to get rid of the pointer chasing in verification. I
// don't know that that is the issue, but I suspect it is.

impl Patterns {
    /// Create a new, empty collection of patterns.
    ///
    /// Patterns are subsequently added with `add`, where the ID of each
    /// pattern is the index at which it was added. Note that `add` panics
    /// on empty patterns, and that searchers built from this collection
    /// require it to be non-empty.
    pub(crate) fn new() -> Patterns {
        Patterns {
            kind: MatchKind::default(),
            by_id: vec![],
            order: vec![],
            minimum_len: usize::MAX,
            total_pattern_bytes: 0,
        }
    }

    /// Add a pattern to this collection.
    ///
    /// This panics if the pattern given is empty.
    pub(crate) fn add(&mut self, bytes: &[u8]) {
        assert!(!bytes.is_empty());
        assert!(self.by_id.len() <= u16::MAX as usize);

        let id = PatternID::new(self.by_id.len()).unwrap();
        self.order.push(id);
        self.by_id.push(bytes.to_vec());
        self.minimum_len = cmp::min(self.minimum_len, bytes.len());
        self.total_pattern_bytes += bytes.len();
    }
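    // Illustrative note: given patterns added in the order
    // ["foo", "foobar"], `set_match_kind` below arranges the iteration
    // order as follows:
    //
    //   MatchKind::LeftmostFirst   => [foo, foobar] (the caller's order)
    //   MatchKind::LeftmostLongest => [foobar, foo] (descending length,
    //                                 with ties broken by the caller's
    //                                 order)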
    /// Set the match kind semantics for this collection of patterns.
    ///
    /// If the kind is not set, then the default is leftmost-first.
    pub(crate) fn set_match_kind(&mut self, kind: MatchKind) {
        self.kind = kind;
        match self.kind {
            MatchKind::LeftmostFirst => {
                self.order.sort();
            }
            MatchKind::LeftmostLongest => {
                let (order, by_id) = (&mut self.order, &mut self.by_id);
                order.sort_by(|&id1, &id2| {
                    by_id[id1].len().cmp(&by_id[id2].len()).reverse()
                });
            }
        }
    }

    /// Return the number of patterns in this collection.
    ///
    /// This is guaranteed to be greater than zero.
    pub(crate) fn len(&self) -> usize {
        self.by_id.len()
    }

    /// Returns true if and only if this collection of patterns is empty.
    pub(crate) fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the approximate total amount of heap used by these patterns,
    /// in units of bytes.
    pub(crate) fn memory_usage(&self) -> usize {
        self.order.len() * mem::size_of::<PatternID>()
            + self.by_id.len() * mem::size_of::<Vec<u8>>()
            + self.total_pattern_bytes
    }

    /// Clears all heap memory associated with this collection of patterns
    /// and resets all state such that it is a valid empty collection.
    pub(crate) fn reset(&mut self) {
        self.kind = MatchKind::default();
        self.by_id.clear();
        self.order.clear();
        self.minimum_len = usize::MAX;
    }

    /// Returns the length, in bytes, of the smallest pattern.
    ///
    /// This is guaranteed to be at least one.
    pub(crate) fn minimum_len(&self) -> usize {
        self.minimum_len
    }

    /// Returns the match semantics used by these patterns.
    pub(crate) fn match_kind(&self) -> &MatchKind {
        &self.kind
    }

    /// Return the pattern with the given identifier. If such a pattern does
    /// not exist, then this panics.
    pub(crate) fn get(&self, id: PatternID) -> Pattern<'_> {
        Pattern(&self.by_id[id])
    }

    /// Return the pattern with the given identifier without performing
    /// bounds checks.
    ///
    /// # Safety
    ///
    /// Callers must ensure that a pattern with the given identifier exists
    /// before using this method.
    pub(crate) unsafe fn get_unchecked(&self, id: PatternID) -> Pattern<'_> {
        Pattern(self.by_id.get_unchecked(id.as_usize()))
    }

    /// Return an iterator over all the patterns in this collection, in the
    /// order in which they should be matched.
    ///
    /// Specifically, in a naive multi-pattern matcher, the following is
    /// guaranteed to satisfy the match semantics of this collection of
    /// patterns:
    ///
    /// ```ignore
    /// for i in 0..haystack.len():
    ///   for p in patterns.iter():
    ///     if haystack[i..].starts_with(p.bytes()):
    ///       return Match(p.id(), i, i + p.bytes().len())
    /// ```
    ///
    /// Namely, among the patterns in a collection, if they are matched in
    /// the order provided by this iterator, then the result is guaranteed
    /// to satisfy the correct match semantics. (Either leftmost-first or
    /// leftmost-longest.)
    pub(crate) fn iter(&self) -> PatternIter<'_> {
        PatternIter { patterns: self, i: 0 }
    }
}

/// An iterator over the patterns in the `Patterns` collection.
///
/// The order of the patterns provided by this iterator is consistent with
/// the match semantics of the originating collection of patterns.
///
/// The lifetime `'p` corresponds to the lifetime of the collection of
/// patterns this is iterating over.
#[derive(Debug)]
pub(crate) struct PatternIter<'p> {
    patterns: &'p Patterns,
    i: usize,
}

impl<'p> Iterator for PatternIter<'p> {
    type Item = (PatternID, Pattern<'p>);

    fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> {
        if self.i >= self.patterns.len() {
            return None;
        }
        let id = self.patterns.order[self.i];
        let p = self.patterns.get(id);
        self.i += 1;
        Some((id, p))
    }
}

/// A pattern that is used in packed searching.
#[derive(Clone)] pub(crate) struct Pattern<'a>(&'a [u8]); impl<'a> fmt::Debug for Pattern<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Pattern") .field("lit", &String::from_utf8_lossy(&self.0)) .finish() } } impl<'p> Pattern<'p> { /// Returns the length of this pattern, in bytes. pub(crate) fn len(&self) -> usize { self.0.len() } /// Returns the bytes of this pattern. pub(crate) fn bytes(&self) -> &[u8] { &self.0 } /// Returns the first `len` low nybbles from this pattern. If this pattern /// is shorter than `len`, then this panics. pub(crate) fn low_nybbles(&self, len: usize) -> Box<[u8]> { let mut nybs = vec![0; len].into_boxed_slice(); for (i, byte) in self.bytes().iter().take(len).enumerate() { nybs[i] = byte & 0xF; } nybs } /// Returns true if this pattern is a prefix of the given bytes. #[inline(always)] pub(crate) fn is_prefix(&self, bytes: &[u8]) -> bool { is_prefix(bytes, self.bytes()) } /// Returns true if this pattern is a prefix of the haystack given by the /// raw `start` and `end` pointers. /// /// # Safety /// /// * It must be the case that `start < end` and that the distance between /// them is at least equal to `V::BYTES`. That is, it must always be valid /// to do at least an unaligned load of `V` at `start`. /// * Both `start` and `end` must be valid for reads. /// * Both `start` and `end` must point to an initialized value. /// * Both `start` and `end` must point to the same allocated object and /// must either be in bounds or at most one byte past the end of the /// allocated object. /// * Both `start` and `end` must be _derived from_ a pointer to the same /// object. /// * The distance between `start` and `end` must not overflow `isize`. /// * The distance being in bounds must not rely on "wrapping around" the /// address space. #[inline(always)] pub(crate) unsafe fn is_prefix_raw( &self, start: *const u8, end: *const u8, ) -> bool { let patlen = self.bytes().len(); let haylen = end.distance(start); if patlen > haylen { return false; } // SAFETY: We've checked that the haystack has length at least equal // to this pattern. All other safety concerns are the responsibility // of the caller. is_equal_raw(start, self.bytes().as_ptr(), patlen) } } /// Returns true if and only if `needle` is a prefix of `haystack`. /// /// This uses a latency optimized variant of `memcmp` internally which *might* /// make this faster for very short strings. /// /// # Inlining /// /// This routine is marked `inline(always)`. If you want to call this function /// in a way that is not always inlined, you'll need to wrap a call to it in /// another function that is marked as `inline(never)` or just `inline`. #[inline(always)] fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool { if needle.len() > haystack.len() { return false; } // SAFETY: Our pointers are derived directly from borrowed slices which // uphold all of our safety guarantees except for length. We account for // length with the check above. unsafe { is_equal_raw(haystack.as_ptr(), needle.as_ptr(), needle.len()) } } /// Compare corresponding bytes in `x` and `y` for equality. /// /// That is, this returns true if and only if `x.len() == y.len()` and /// `x[i] == y[i]` for all `0 <= i < x.len()`. /// /// Note that this isn't used. We only use it in tests as a convenient way /// of testing `is_equal_raw`. /// /// # Inlining /// /// This routine is marked `inline(always)`. 
If you want to call this function
/// in a way that is not always inlined, you'll need to wrap a call to it in
/// another function that is marked as `inline(never)` or just `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results
/// in a call out to the current platform's `libc` which might not be
/// inlineable or have other overhead. This routine isn't guaranteed to be a
/// win, but it might be in some cases.
#[cfg(test)]
#[inline(always)]
fn is_equal(x: &[u8], y: &[u8]) -> bool {
    if x.len() != y.len() {
        return false;
    }
    // SAFETY: Our pointers are derived directly from borrowed slices which
    // uphold all of our safety guarantees except for length. We account for
    // length with the check above.
    unsafe { is_equal_raw(x.as_ptr(), y.as_ptr(), x.len()) }
}

/// Compare `n` bytes at the given pointers for equality.
///
/// This returns true if and only if `*x.add(i) == *y.add(i)` for all
/// `0 <= i < n`.
///
/// # Inlining
///
/// This routine is marked `inline(always)`. If you want to call this
/// function in a way that is not always inlined, you'll need to wrap a call
/// to it in another function that is marked as `inline(never)` or just
/// `inline`.
///
/// # Motivation
///
/// Why not use slice equality instead? Well, slice equality usually results
/// in a call out to the current platform's `libc` which might not be
/// inlineable or have other overhead. This routine isn't guaranteed to be a
/// win, but it might be in some cases.
///
/// # Safety
///
/// * Both `x` and `y` must be valid for reads of up to `n` bytes.
/// * Both `x` and `y` must point to an initialized value.
/// * Both `x` and `y` must each point to an allocated object and
/// must either be in bounds or at most one byte past the end of the
/// allocated object. `x` and `y` do not need to point to the same allocated
/// object, but they may.
/// * Both `x` and `y` must be _derived from_ a pointer to their respective
/// allocated objects.
/// * The distance between `x` and `x+n` must not overflow `isize`. Similarly
/// for `y` and `y+n`.
/// * The distance being in bounds must not rely on "wrapping around" the
/// address space.
#[inline(always)]
unsafe fn is_equal_raw(mut x: *const u8, mut y: *const u8, n: usize) -> bool {
    // If we don't have enough bytes to do 4-byte at a time loads, then
    // handle each possible length specially. Note that I used to have a
    // byte-at-a-time loop here and that turned out to be quite a bit slower
    // for the memmem/pathological/defeat-simple-vector-alphabet benchmark.
    if n < 4 {
        return match n {
            0 => true,
            1 => x.read() == y.read(),
            2 => {
                x.cast::<u16>().read_unaligned()
                    == y.cast::<u16>().read_unaligned()
            }
            // I also tried copy_nonoverlapping here and it looks like the
            // codegen is the same.
            3 => x.cast::<[u8; 3]>().read() == y.cast::<[u8; 3]>().read(),
            _ => unreachable!(),
        };
    }
    // When we have 4 or more bytes to compare, then proceed in chunks of 4
    // at a time using unaligned loads.
    //
    // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason
    // is that this particular version of memcmp is likely to be called with
    // tiny needles. That means that if we do 8 byte loads, then a higher
    // proportion of memcmp calls will use the slower variant above. With
    // that said, this is a hypothesis and is only loosely supported by
    // benchmarks. There's likely some improvement that could be made here.
    // The main thing here though is to optimize for latency, not throughput.
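    //
    // Worked example: with n = 6, `xend` below is `x + 2`. The loop
    // compares bytes [0, 4) and then stops (x becomes x + 4, which is past
    // xend), and the final unaligned load compares bytes [2, 6). The middle
    // bytes are compared twice, which is harmless, and every byte is
    // covered without a byte-at-a-time tail loop.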
    // SAFETY: The caller is responsible for ensuring the pointers we get
    // are valid and readable for at least `n` bytes. We also do unaligned
    // loads, so there's no need to ensure we're aligned. (This is justified
    // by this routine being specifically for short strings.)
    let xend = x.add(n.wrapping_sub(4));
    let yend = y.add(n.wrapping_sub(4));
    while x < xend {
        let vx = x.cast::<u32>().read_unaligned();
        let vy = y.cast::<u32>().read_unaligned();
        if vx != vy {
            return false;
        }
        x = x.add(4);
        y = y.add(4);
    }
    let vx = xend.cast::<u32>().read_unaligned();
    let vy = yend.cast::<u32>().read_unaligned();
    vx == vy
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn equals_different_lengths() {
        assert!(!is_equal(b"", b"a"));
        assert!(!is_equal(b"a", b""));
        assert!(!is_equal(b"ab", b"a"));
        assert!(!is_equal(b"a", b"ab"));
    }

    #[test]
    fn equals_mismatch() {
        let one_mismatch = [
            (&b"a"[..], &b"x"[..]),
            (&b"ab"[..], &b"ax"[..]),
            (&b"abc"[..], &b"abx"[..]),
            (&b"abcd"[..], &b"abcx"[..]),
            (&b"abcde"[..], &b"abcdx"[..]),
            (&b"abcdef"[..], &b"abcdex"[..]),
            (&b"abcdefg"[..], &b"abcdefx"[..]),
            (&b"abcdefgh"[..], &b"abcdefgx"[..]),
            (&b"abcdefghi"[..], &b"abcdefghx"[..]),
            (&b"abcdefghij"[..], &b"abcdefghix"[..]),
            (&b"abcdefghijk"[..], &b"abcdefghijx"[..]),
            (&b"abcdefghijkl"[..], &b"abcdefghijkx"[..]),
            (&b"abcdefghijklm"[..], &b"abcdefghijklx"[..]),
            (&b"abcdefghijklmn"[..], &b"abcdefghijklmx"[..]),
        ];
        for (x, y) in one_mismatch {
            assert_eq!(x.len(), y.len(), "lengths should match");
            assert!(!is_equal(x, y));
            assert!(!is_equal(y, x));
        }
    }

    #[test]
    fn equals_yes() {
        assert!(is_equal(b"", b""));
        assert!(is_equal(b"a", b"a"));
        assert!(is_equal(b"ab", b"ab"));
        assert!(is_equal(b"abc", b"abc"));
        assert!(is_equal(b"abcd", b"abcd"));
        assert!(is_equal(b"abcde", b"abcde"));
        assert!(is_equal(b"abcdef", b"abcdef"));
        assert!(is_equal(b"abcdefg", b"abcdefg"));
        assert!(is_equal(b"abcdefgh", b"abcdefgh"));
        assert!(is_equal(b"abcdefghi", b"abcdefghi"));
    }

    #[test]
    fn prefix() {
        assert!(is_prefix(b"", b""));
        assert!(is_prefix(b"a", b""));
        assert!(is_prefix(b"ab", b""));
        assert!(is_prefix(b"foo", b"foo"));
        assert!(is_prefix(b"foobar", b"foo"));

        assert!(!is_prefix(b"foo", b"fob"));
        assert!(!is_prefix(b"foobar", b"fob"));
    }
}
aho-corasick-1.1.3/src/packed/rabinkarp.rs000064400000000000000000000144501046102023000165320ustar 00000000000000use alloc::{sync::Arc, vec, vec::Vec};

use crate::{packed::pattern::Patterns, util::search::Match, PatternID};

/// The type of the rolling hash used in the Rabin-Karp algorithm.
type Hash = usize;

/// The number of buckets to store our patterns in. We don't want this to be
/// too big in order to avoid wasting memory, but we don't want it to be too
/// small either to avoid spending too much time confirming literals.
///
/// The number of buckets MUST be a power of two. Otherwise, determining the
/// bucket from a hash will slow down the code considerably. Using a power
/// of two means `hash % NUM_BUCKETS` can compile down to a simple `and`
/// instruction.
const NUM_BUCKETS: usize = 64;

/// An implementation of the Rabin-Karp algorithm. The main idea of this
/// algorithm is to maintain a rolling hash as it moves through the input,
/// and then check whether that hash corresponds to the same hash for any of
/// the patterns we're looking for.
///
/// A drawback of naively scaling Rabin-Karp to multiple patterns is that it
/// requires all of the patterns to be the same length, which in turn
/// corresponds to the number of bytes to hash. We adapt this to work for
/// multiple patterns of varying size by fixing the number of bytes to hash
/// to be the length of the smallest pattern. We also split the patterns into
/// several buckets to hopefully make the confirmation step faster.
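///
/// Concretely (an illustration added here to match `hash` and `update_hash`
/// below): for a window of length `L`, the hash of bytes `b[0]..b[L-1]` is
///
/// ```ignore
/// H = b[0] * 2^(L-1) + b[1] * 2^(L-2) + ... + b[L-1]    (wrapping)
/// ```
///
/// and sliding the window one byte to the right, dropping `old` and taking
/// in `new`, is
///
/// ```ignore
/// H' = (H - old * 2^(L-1)) * 2 + new                    (wrapping)
/// ```
///
/// where `2^(L-1)` is precomputed as `hash_2pow`.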
///
/// Wikipedia has a decent explanation, if a bit heavy on the theory:
/// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm
///
/// But ESMAJ provides something a bit more concrete:
/// https://www-igm.univ-mlv.fr/~lecroq/string/node5.html
#[derive(Clone, Debug)]
pub(crate) struct RabinKarp {
    /// The patterns we're searching for.
    patterns: Arc<Patterns>,
    /// The order of patterns in each bucket is significant. Namely, they
    /// are arranged such that the first one to match is the correct match.
    /// This may not necessarily correspond to the order provided by the
    /// caller. For example, if leftmost-longest semantics are used, then
    /// the patterns are sorted by their length in descending order. If
    /// leftmost-first semantics are used, then the patterns are sorted by
    /// their pattern ID in ascending order (which corresponds to the
    /// caller's order).
    buckets: Vec<Vec<(Hash, PatternID)>>,
    /// The length of the hashing window. Generally, this corresponds to the
    /// length of the smallest pattern.
    hash_len: usize,
    /// The factor to subtract out of a hash before updating it with a new
    /// byte.
    hash_2pow: usize,
}

impl RabinKarp {
    /// Compile a new Rabin-Karp matcher from the patterns given.
    ///
    /// This panics if any of the patterns in the collection are empty, or
    /// if the collection is itself empty.
    pub(crate) fn new(patterns: &Arc<Patterns>) -> RabinKarp {
        assert!(patterns.len() >= 1);
        let hash_len = patterns.minimum_len();
        assert!(hash_len >= 1);

        let mut hash_2pow = 1usize;
        for _ in 1..hash_len {
            hash_2pow = hash_2pow.wrapping_shl(1);
        }
        let mut rk = RabinKarp {
            patterns: Arc::clone(patterns),
            buckets: vec![vec![]; NUM_BUCKETS],
            hash_len,
            hash_2pow,
        };
        for (id, pat) in patterns.iter() {
            let hash = rk.hash(&pat.bytes()[..rk.hash_len]);
            let bucket = hash % NUM_BUCKETS;
            rk.buckets[bucket].push((hash, id));
        }
        rk
    }

    /// Return the first matching pattern in the given haystack, beginning
    /// the search at `at`.
    pub(crate) fn find_at(
        &self,
        haystack: &[u8],
        mut at: usize,
    ) -> Option<Match> {
        assert_eq!(NUM_BUCKETS, self.buckets.len());

        if at + self.hash_len > haystack.len() {
            return None;
        }
        let mut hash = self.hash(&haystack[at..at + self.hash_len]);
        loop {
            let bucket = &self.buckets[hash % NUM_BUCKETS];
            for &(phash, pid) in bucket {
                if phash == hash {
                    if let Some(c) = self.verify(pid, haystack, at) {
                        return Some(c);
                    }
                }
            }
            if at + self.hash_len >= haystack.len() {
                return None;
            }
            hash = self.update_hash(
                hash,
                haystack[at],
                haystack[at + self.hash_len],
            );
            at += 1;
        }
    }

    /// Returns the approximate total amount of heap used by this searcher,
    /// in units of bytes.
    pub(crate) fn memory_usage(&self) -> usize {
        self.buckets.len() * core::mem::size_of::<Vec<(Hash, PatternID)>>()
            + self.patterns.len() * core::mem::size_of::<(Hash, PatternID)>()
    }
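    // Illustrative note: with patterns ["foobar", "foo"], the hashing
    // window is 3 bytes, so "foo" and the first three bytes of "foobar"
    // hash identically and land in the same bucket. `verify` below is what
    // distinguishes a real match from such prefix collisions.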
    /// Verify whether the pattern with the given id matches at
    /// `haystack[at..]`.
    ///
    /// We tag this function as `cold` because it helps improve codegen.
    /// Intuitively, it would seem like inlining it would be better.
    /// However, the only time this is called and a match is not found is
    /// when there is a hash collision, or when a prefix of a pattern
    /// matches but the entire pattern doesn't match. This is hopefully
    /// fairly rare, and if it does occur a lot, it's going to be slow no
    /// matter what we do.
    #[cold]
    fn verify(
        &self,
        id: PatternID,
        haystack: &[u8],
        at: usize,
    ) -> Option<Match> {
        let pat = self.patterns.get(id);
        if pat.is_prefix(&haystack[at..]) {
            Some(Match::new(id, at..at + pat.len()))
        } else {
            None
        }
    }

    /// Hash the given bytes.
    fn hash(&self, bytes: &[u8]) -> Hash {
        assert_eq!(self.hash_len, bytes.len());

        let mut hash = 0usize;
        for &b in bytes {
            hash = hash.wrapping_shl(1).wrapping_add(b as usize);
        }
        hash
    }

    /// Update the hash given based on removing `old_byte` at the beginning
    /// of some byte string, and appending `new_byte` to the end of that
    /// same byte string.
    fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash {
        prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow))
            .wrapping_shl(1)
            .wrapping_add(new_byte as usize)
    }
}
aho-corasick-1.1.3/src/packed/teddy/README.md000064400000000000000000000453331046102023000166070ustar 00000000000000Teddy is a SIMD accelerated multiple substring matching algorithm. The name
and the core ideas in the algorithm were learned from the
[Hyperscan][1_u] project. The implementation in this repository was mostly
motivated for use in accelerating regex searches by searching for small sets
of required literals extracted from the regex.

# Background

The key idea of Teddy is to do *packed* substring matching. In the
literature, packed substring matching is the idea of examining multiple
bytes in a haystack at a time to detect matches. Implementations of, for
example, memchr (which detects matches of a single byte) have been doing
this for years. Only recently, with the introduction of various SIMD
instructions, has this been extended to substring matching. The PCMPESTRI
instruction (and its relatives), for example, implements substring matching
in hardware. It is, however, limited to substrings of length 16 bytes or
fewer, but this restriction is fine in a regex engine, since we rarely care
about the performance difference between searching for a 16 byte literal and
a 16 + N literal; 16 is already long enough. The key downside of the
PCMPESTRI instruction, on current (2016) CPUs at least, is its latency and
throughput. As a result, it is often faster to do substring search with a
Boyer-Moore (or Two-Way) variant and a well placed memchr to quickly skip
through the haystack.

There are fewer results from the literature on packed substring matching,
and even fewer for packed multiple substring matching. Ben-Kiki et al. [2]
describes use of PCMPESTRI for substring matching, but is mostly theoretical
and hand-waves performance. There is other theoretical work done by Bille
[3] as well.

The rest of the work in the field, as far as I'm aware, is by Faro and
Kulekci and is generally focused on multiple pattern search. Their first
paper [4a] introduces the concept of a fingerprint, which is computed for
every block of N bytes in every pattern. The haystack is then scanned N
bytes at a time and a fingerprint is computed in the same way it was
computed for blocks in the patterns. If the fingerprint corresponds to one
that was found in a pattern, then a verification step follows to confirm
that one of the substrings with the corresponding fingerprint actually
matches at the current location. Various implementation tricks are employed
to make sure the fingerprint lookup is fast; typically by truncating the
fingerprint. (This may, of course, provoke more steps in the verification
process, so a balance must be struck.)
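As a rough sketch (written for this README rather than taken from the
papers; `fingerprint`, `truncate` and `table` are stand-in names), the
scan-and-verify structure shared by these fingerprint approaches looks
something like this:

```ignore
let mut at = 0;
while at + N <= haystack.len() {
    let f = truncate(fingerprint(&haystack[at..at + N]));
    if let Some(pattern_ids) = table.get(f) {
        for pid in pattern_ids {
            // verification step: check whether pattern `pid` actually
            // matches at (or around) position `at`
        }
    }
    at += N;
}
```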
The main downside of [4a] is that the minimum substring length is 32 bytes, presumably because of how the algorithm uses certain SIMD instructions. This essentially makes it useless for general purpose regex matching, where a small number of short patterns is far more likely. Faro and Kulekci published another paper [4b] that is conceptually very similar to [4a]. The key difference is that it uses the CRC32 instruction (introduced as part of SSE 4.2) to compute fingerprint values. This also enables the algorithm to work effectively on substrings as short as 7 bytes with 4 byte windows. 7 bytes is unfortunately still too long. The window could be technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the small window size ends up negating most performance benefits—and it's likely the common case in a general purpose regex engine. Faro and Kulekci also published [4c] that appears to be intended as a replacement to using PCMPESTRI. In particular, it is specifically motivated by the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD instructions that are faster. While this approach works for short substrings, I personally couldn't see a way to generalize it to multiple substring search. Faro and Kulekci have another paper [4d] that I haven't been able to read because it is behind a paywall. # Teddy Finally, we get to Teddy. If the above literature review is complete, then it appears that Teddy is a novel algorithm. More than that, in my experience, it completely blows away the competition for short substrings, which is exactly what we want in a general purpose regex engine. Again, the algorithm appears to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced late 2015, and no earlier history could be found. Therefore, tracking the exact provenance of the algorithm with respect to the published literature seems difficult. At a high level, Teddy works somewhat similarly to the fingerprint algorithms published by Faro and Kulekci, but Teddy does it in a way that scales a bit better. Namely: 1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX) byte chunks. 16 (or 32) is significant because it corresponds to the number of bytes in a SIMD vector. 2. Bitwise operations are performed on each chunk to discover if any region of it matches a set of precomputed fingerprints from the patterns. If there are matches, then a verification step is performed. In this implementation, our verification step is naive. This can be improved upon. The details to make this work are quite clever. First, we must choose how to pick our fingerprints. In Hyperscan's implementation, I *believe* they use the last N bytes of each substring, where N must be at least the minimum length of any substring in the set being searched. In this implementation, we use the first N bytes of each substring. (The tradeoffs between these choices aren't yet clear to me.) We then must figure out how to quickly test whether an occurrence of any fingerprint from the set of patterns appears in a 16 byte block from the haystack. To keep things simple, let's assume N = 1 and examine some examples to motivate the approach. Here are our patterns: ```ignore foo bar baz ``` The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set our 16 byte block to: ```ignore bat cat foo bump xxxxxxxxxxxxxxxx ``` To cut to the chase, Teddy works by using bitsets. 
In particular, Teddy creates a mask that allows us to quickly compute membership of a fingerprint in a 16 byte block that also tells which pattern the fingerprint corresponds to. In this case, our fingerprint is a single byte, so an appropriate abstraction is a map from a single byte to a list of patterns that contain that fingerprint: ```ignore f |--> foo b |--> bar, baz ``` Now, all we need to do is figure out how to represent this map in vector space and use normal SIMD operations to perform a lookup. The first simplification we can make is to represent our patterns as bit fields occupying a single byte. This is important, because a single SIMD vector can store 16 bytes. ```ignore f |--> 00000001 b |--> 00000010, 00000100 ``` How do we perform lookup though? It turns out that SSSE3 introduced a very cool instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`, and returns a third vector `C`. All vectors are treated as 16 8-bit integers. `C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true for the purposes of this algorithm. For full details, see [Intel's Intrinsics Guide][5_u].) This essentially lets us use the values in `B` to lookup values in `A`. If we could somehow cause `B` to contain our 16 byte block from the haystack, and if `A` could contain our bitmasks, then we'd end up with something like this for `A`: ```ignore 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF A = 0 0 00000110 00000001 0 ``` And if `B` contains our window from our haystack, we could use shuffle to take the values from `B` and use them to look up our bitsets in `A`. But of course, we can't do this because `A` in the above example contains 256 bytes, which is much larger than the size of a SIMD vector. Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of our bitsets, we can use two masks, where one mask corresponds to the lower four bits of our fingerprint and the other mask corresponds to the upper four bits. So our map now looks like: ```ignore 'f' & 0xF = 0x6 |--> 00000001 'f' >> 4 = 0x6 |--> 00000111 'b' & 0xF = 0x2 |--> 00000110 'b' >> 4 = 0x6 |--> 00000111 ``` Notice that the bitsets for each nybble correspond to the union of all fingerprints that contain that nybble. For example, both `f` and `b` have the same upper 4 bits but differ on the lower 4 bits. Putting this together, we have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is our mask for the upper nybble and `B` is our 16 byte block from the haystack: ```ignore 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF A0 = 0 0 00000110 0 00000001 0 A1 = 0 0 0 0 00000111 0 B = b a t _ t p B = 0x62 0x61 0x74 0x20 0x74 0x70 ``` But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits, and we need indexes that are at most 4 bits (corresponding to one of 16 values). We can apply the same transformation to split `B` into lower and upper nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and `B1` corresponds to the upper nybbles: ```ignore b a t _ c a t _ f o o _ b u m p B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0 B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7 ``` And now we have a nice correspondence. `B0` can index `A0` and `B1` can index `A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`: ```ignore b a ... f o ... p A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0] C0 = 00000110 0 00000001 0 0 ``` And `C1 = PSHUFB(A1, B1)`: ```ignore b a ... f o ... 
p
     A1[0x6]   A1[0x6]         A1[0x6]   A1[0x6]        A1[0x7]
C1 = 00000111  00000111        00000111  00000111       0
```

Notice how neither one of `C0` or `C1` is guaranteed to report fully correct
results all on its own. For example, `C1` claims that `b` is a fingerprint
for the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a
fingerprint for all of our patterns. But if we combined `C0` and `C1` with
an `AND` operation:

```ignore
     b         a   ...   f         o   ...   p
C =  00000110  0         00000001  0         0
```

Then we now have that `C[i]` contains a bitset corresponding to the matching
fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in
that block.

Once we have that, we can look for the position of the least significant bit
in `C`. (Least significant because we only target little endian here. Thus,
the least significant bytes correspond to bytes in our haystack at a lower
address.) That position, modulo `8`, gives us the pattern that the
fingerprint matches. That position, integer divided by `8`, also gives us
the byte offset that the fingerprint occurs in inside the 16 byte haystack
block. Using those two pieces of information, we can run a verification
procedure that tries to match all substrings containing that fingerprint at
that position in the haystack.

# Implementation notes

The problem with the algorithm as described above is that it uses a single
byte for a fingerprint. This will work well if the fingerprints are rare in
the haystack (e.g., capital letters or special characters in normal English
text), but if the fingerprints are common, you'll wind up spending too much
time in the verification step, which effectively negates the performance
benefits of scanning 16 bytes at a time. Remember, the key to the
performance of this algorithm is to do as little work as possible per 16 (or
32) bytes.

This algorithm can be extrapolated in a relatively straight-forward way to
use larger fingerprints. That is, instead of a single byte prefix, we might
use a two, three or four byte prefix. The implementation here implements
N = {1, 2, 3, 4} (matching the mask lengths handled by the builder) and
always picks the largest N possible. The rationale is that the bigger the
fingerprint, the fewer verification steps we'll do. Of course, if N is too
large, then we'll end up doing too much on each step. The way to extend it
is:

1. Add a mask for each byte in the fingerprint. (Remember that each mask is
   composed of two SIMD vectors.) This results in a value of `C` for each
   byte in the fingerprint while searching.
2. When testing each 16 (or 32) byte block, each value of `C` must be
   shifted so that they are aligned. Once aligned, they should all be
   `AND`'d together. This will give you only the bitsets corresponding to
   the full match of the fingerprint. To do this, one needs to save the
   last `N - 1` bytes (one byte for N=2, two bytes for N=3, three bytes for
   N=4) from the previous iteration, and then line them up with the first
   `N - 1` bytes of the next iteration.

## Verification

Verification generally follows the procedure outlined above. The tricky
parts are in the right formulation of operations to get our bits out of our
vectors. We have a limited set of operations available to us on SIMD vectors
as 128-bit or 256-bit numbers, so we wind up needing to rip out 2 (or 4)
64-bit integers from our vectors, and then run our verification step on each
of those. The verification step looks at the least significant bit set, and
from its position, we can derive the byte offset and bucket. (Again, as
described above.) Once we know the bucket, we do a fairly naive exhaustive
search for every literal in that bucket.
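To make the bit manipulation concrete, here is a scalar model (ordinary Rust
written for this README; the real implementation does the same thing with
PSHUFB and AND on SIMD vectors) of candidate detection for a 16 byte block,
followed by the least-significant-bit bookkeeping just described:

```rust
/// C[i] = A0[low_nybble(B[i])] & A1[high_nybble(B[i])]
fn candidates(a0: [u8; 16], a1: [u8; 16], block: [u8; 16]) -> [u8; 16] {
    let mut c = [0u8; 16];
    for i in 0..16 {
        c[i] = a0[(block[i] & 0xF) as usize] & a1[(block[i] >> 4) as usize];
    }
    c
}

/// Returns the (byte offset, bucket) of the first candidate, if any.
fn first_candidate(c: [u8; 16]) -> Option<(u32, u32)> {
    // Interpret C as a little-endian 128-bit integer. The position of the
    // least significant set bit, divided by 8, is the byte offset in the
    // block; modulo 8, it is the bucket (i.e., pattern bit) to verify.
    let bits = u128::from_le_bytes(c);
    if bits == 0 {
        return None;
    }
    let pos = bits.trailing_zeros();
    Some((pos / 8, pos % 8))
}
```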
Hyperscan is a bit smarter in this verification step and uses a hash table,
but I haven't had time to thoroughly explore that. (A few initial
half-hearted attempts resulted in worse performance.)

## AVX

The AVX version of Teddy extrapolates almost perfectly from the SSE version.
The only hiccup is that PALIGNR is used to align chunks in the 16-byte
(128-bit) version, and there is no equivalent instruction in AVX. AVX does
have VPALIGNR, but it only works within 128-bit lanes. So there's a bit of
tomfoolery to get around this by shuffling the vectors before calling
VPALIGNR.

The only other aspect to AVX is that since our masks are still fundamentally
16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply
to 32-byte chunks.

## Fat Teddy

In the version of Teddy described above, 8 buckets are used to group
patterns that we want to search for. However, when AVX is available, we can
extend the number of buckets to 16 by permitting each byte in our masks to
use 16-bits instead of 8-bits to represent the buckets it belongs to. (This
variant is also in Hyperscan.) However, what we give up is the ability to
scan 32 bytes at a time, even though we're using AVX. Instead, we have to
scan 16 bytes at a time. What we gain, though, is (hopefully) less work in
our verification routine. If patterns are more spread out across more
buckets, then there should overall be fewer false positives. In general,
Fat Teddy permits us to grow our capacity a bit and search for more literals
before Teddy gets overwhelmed.

The tricky part of Fat Teddy is in how we adjust our masks and our
verification procedure. For the masks, we simply represent the first 8
buckets in each of the low 16 bytes, and then the second 8 buckets in each
of the high 16 bytes. Then, in the search loop, instead of loading 32 bytes
from the haystack, we load the same 16 bytes from the haystack into both the
low and high 16 byte portions of our 256-bit vector. So for example, a mask
might look like this:

    bits:     00100001 00000000 ... 11000000 00000000 00000001 ... 00000000
    byte:           31       30           16       15       14            0
    offset:         15       14            0       15       14            0
    buckets:      8-15     8-15         8-15      0-7      0-7          0-7

Where `byte` is the position in the vector (higher numbers corresponding to
more significant bits), `offset` is the corresponding position in the
haystack chunk, and `buckets` corresponds to the bucket assignments for that
particular byte.

In particular, notice that the bucket assignments for offset `0` are spread
out between bytes `0` and `16`. This works well for the chunk-by-chunk
search procedure, but verification really wants to process all bucket
assignments for each offset at once. Otherwise, we might wind up finding a
match at offset `1` in one of the first 8 buckets, when we really should
have reported a match at offset `0` in one of the second 8 buckets. (Because
we want the leftmost match.)

Thus, for verification, we rearrange the above vector such that it is a
sequence of 16-bit integers, where the least significant 16-bit integer
corresponds to all of the bucket assignments for offset `0`. So with the
above vector, the least significant 16-bit integer would be

    11000000 00000000

which was taken from bytes `16` and `0`. Then the verification step pretty
much runs as described, except with 16 buckets instead of 8.

# References

- **[1]** [Hyperscan on GitHub](https://github.com/intel/hyperscan),
    [webpage](https://www.hyperscan.io/)
- **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi,
    R., & Weimann, O. (2011).
    _Optimal packed string matching_.
    In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13).
    Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik.
    DOI: 10.4230/LIPIcs.FSTTCS.2011.423.
    [PDF](https://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf).
- **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi,
    R., & Weimann, O. (2014).
    _Towards optimal packed string matching_.
    Theoretical Computer Science, 525, 111-129.
    DOI: 10.1016/j.tcs.2013.06.013.
    [PDF](https://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf).
- **[3]** Bille, P. (2011).
    _Fast searching in packed strings_.
    Journal of Discrete Algorithms, 9(1), 49-56.
    DOI: 10.1016/j.jda.2010.09.003.
    [PDF](https://www.sciencedirect.com/science/article/pii/S1570866710000353).
- **[4a]** Faro, S., & Külekci, M. O. (2012, October).
    _Fast multiple string matching using streaming SIMD extensions
    technology_.
    In String Processing and Information Retrieval (pp. 217-228).
    Springer Berlin Heidelberg.
    DOI: 10.1007/978-3-642-34109-0_23.
    [PDF](https://www.dmi.unict.it/faro/papers/conference/faro32.pdf).
- **[4b]** Faro, S., & Külekci, M. O. (2013, September).
    _Towards a Very Fast Multiple String Matching Algorithm for Short
    Patterns_.
    In Stringology (pp. 78-91).
    [PDF](https://www.dmi.unict.it/faro/papers/conference/faro36.pdf).
- **[4c]** Faro, S., & Külekci, M. O. (2013, January).
    _Fast packed string matching for short patterns_.
    In Proceedings of the Meeting on Algorithm Engineering & Experiments
    (pp. 113-121).
    Society for Industrial and Applied Mathematics.
    [PDF](https://arxiv.org/pdf/1209.6449.pdf).
- **[4d]** Faro, S., & Külekci, M. O. (2014).
    _Fast and flexible packed string matching_.
    Journal of Discrete Algorithms, 28, 61-72.
    DOI: 10.1016/j.jda.2014.07.003.

[1_u]: https://github.com/intel/hyperscan
[5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide
aho-corasick-1.1.3/src/packed/teddy/builder.rs000064400000000000000000000745401046102023000173260ustar 00000000000000use core::{
    fmt::Debug,
    panic::{RefUnwindSafe, UnwindSafe},
};

use alloc::sync::Arc;

use crate::packed::{ext::Pointer, pattern::Patterns, teddy::generic::Match};

/// A builder for constructing a Teddy matcher.
///
/// The builder primarily permits fine grained configuration of the Teddy
/// matcher. Most options are made only available for testing/benchmarking
/// purposes. In reality, options are automatically determined by the nature
/// and number of patterns given to the builder.
#[derive(Clone, Debug)]
pub(crate) struct Builder {
    /// When none, this is automatically determined. Otherwise, `false`
    /// means slim Teddy is used (8 buckets) and `true` means fat Teddy is
    /// used (16 buckets). Fat Teddy requires AVX2, so if that CPU feature
    /// isn't available and Fat Teddy was requested, no matcher will be
    /// built.
    only_fat: Option<bool>,
    /// When none, this is automatically determined. Otherwise, `false`
    /// means that 128-bit vectors will be used (up to SSSE3 instructions)
    /// whereas `true` means that 256-bit vectors will be used. As with
    /// `fat`, if 256-bit vectors are requested and they aren't available,
    /// then a searcher will not be built.
    only_256bit: Option<bool>,
    /// When true (the default), the number of patterns will be used as a
    /// heuristic for refusing construction of a Teddy searcher. The point
    /// here is that too many patterns can overwhelm Teddy. But this can be
    /// disabled in cases where the caller knows better.
    heuristic_pattern_limits: bool,
}

impl Default for Builder {
    fn default() -> Builder {
        Builder::new()
    }
}

impl Builder {
    /// Create a new builder for configuring a Teddy matcher.
    pub(crate) fn new() -> Builder {
        Builder {
            only_fat: None,
            only_256bit: None,
            heuristic_pattern_limits: true,
        }
    }

    /// Build a matcher for the set of patterns given. If a matcher could
    /// not be built, then `None` is returned.
    ///
    /// Generally, a matcher isn't built if the necessary CPU features
    /// aren't available, the target is unsupported, or if the searcher is
    /// believed to be slower than standard techniques (i.e., if there are
    /// too many literals).
    pub(crate) fn build(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
        self.build_imp(patterns)
    }

    /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses
    /// 16 buckets whereas Slim Teddy uses 8 buckets. More buckets are
    /// useful for a larger set of literals.
    ///
    /// `None` is the default, which results in an automatic selection based
    /// on the number of literals and available CPU features.
    pub(crate) fn only_fat(&mut self, yes: Option<bool>) -> &mut Builder {
        self.only_fat = yes;
        self
    }

    /// Request the use of 256-bit vectors (true) or 128-bit vectors
    /// (false). Generally, a larger vector size is better since it either
    /// permits matching more patterns or matching more bytes in the
    /// haystack at once.
    ///
    /// `None` is the default, which results in an automatic selection based
    /// on the number of literals and available CPU features.
    pub(crate) fn only_256bit(&mut self, yes: Option<bool>) -> &mut Builder {
        self.only_256bit = yes;
        self
    }

    /// Request that heuristic limitations on the number of patterns be
    /// employed. This is useful to disable for benchmarking where one wants
    /// to explore how Teddy performs on a large number of patterns even if
    /// the heuristics would otherwise refuse construction.
    ///
    /// This is enabled by default.
    pub(crate) fn heuristic_pattern_limits(
        &mut self,
        yes: bool,
    ) -> &mut Builder {
        self.heuristic_pattern_limits = yes;
        self
    }

    fn build_imp(&self, patterns: Arc<Patterns>) -> Option<Searcher> {
        let patlimit = self.heuristic_pattern_limits;
        // There's no particular reason why we limit ourselves to little
        // endian here, but it seems likely that some parts of Teddy as they
        // are currently written (e.g., the uses of `trailing_zeros`) are
        // likely wrong on non-little-endian targets. Such things are likely
        // easy to fix, but at the time of writing (2023/09/18), I actually
        // do not know how to test this code on a big-endian target. So for
        // now, we're conservative and just bail out.
        if !cfg!(target_endian = "little") {
            debug!("skipping Teddy because target isn't little endian");
            return None;
        }
        // Too many patterns will overwhelm Teddy and likely lead to slow
        // downs, typically in the verification step.
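        // The cap here is a total of 64 patterns; stricter caps that depend
        // on the mask length (e.g., 16 patterns when only a 1 byte mask can
        // be used) are applied further below.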
        if patlimit && patterns.len() > 64 {
            debug!("skipping Teddy because of too many patterns");
            return None;
        }
        #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
        {
            use self::x86_64::{FatAVX2, SlimAVX2, SlimSSSE3};

            let mask_len = core::cmp::min(4, patterns.minimum_len());
            let beefy = patterns.len() > 32;
            let has_avx2 = self::x86_64::is_available_avx2();
            let has_ssse3 = has_avx2 || self::x86_64::is_available_ssse3();
            let use_avx2 = if self.only_256bit == Some(true) {
                if !has_avx2 {
                    debug!(
                        "skipping Teddy because avx2 was demanded but \
                         unavailable"
                    );
                    return None;
                }
                true
            } else if self.only_256bit == Some(false) {
                if !has_ssse3 {
                    debug!(
                        "skipping Teddy because ssse3 was demanded but \
                         unavailable"
                    );
                    return None;
                }
                false
            } else if !has_ssse3 && !has_avx2 {
                debug!(
                    "skipping Teddy because ssse3 and avx2 are unavailable"
                );
                return None;
            } else {
                has_avx2
            };
            let fat = match self.only_fat {
                None => use_avx2 && beefy,
                Some(false) => false,
                Some(true) if !use_avx2 => {
                    debug!(
                        "skipping Teddy because fat was demanded, but fat \
                         Teddy requires avx2 which is unavailable"
                    );
                    return None;
                }
                Some(true) => true,
            };
            // Just like for aarch64, it's possible that too many patterns
            // will overwhelm Teddy. Unlike aarch64 though, we have Fat
            // Teddy which helps things scale a bit more by spreading
            // patterns over more buckets.
            //
            // These thresholds were determined by looking at the
            // measurements for the rust/aho-corasick/packed/leftmost-first
            // and rust/aho-corasick/dfa/leftmost-first engines on the
            // `teddy/` benchmarks.
            if patlimit && mask_len == 1 && patterns.len() > 16 {
                debug!(
                    "skipping Teddy (mask len: 1) because there are \
                     too many patterns",
                );
                return None;
            }
            match (mask_len, use_avx2, fat) {
                (1, false, _) => {
                    debug!("Teddy choice: 128-bit slim, 1 byte");
                    SlimSSSE3::<1>::new(&patterns)
                }
                (1, true, false) => {
                    debug!("Teddy choice: 256-bit slim, 1 byte");
                    SlimAVX2::<1>::new(&patterns)
                }
                (1, true, true) => {
                    debug!("Teddy choice: 256-bit fat, 1 byte");
                    FatAVX2::<1>::new(&patterns)
                }
                (2, false, _) => {
                    debug!("Teddy choice: 128-bit slim, 2 bytes");
                    SlimSSSE3::<2>::new(&patterns)
                }
                (2, true, false) => {
                    debug!("Teddy choice: 256-bit slim, 2 bytes");
                    SlimAVX2::<2>::new(&patterns)
                }
                (2, true, true) => {
                    debug!("Teddy choice: 256-bit fat, 2 bytes");
                    FatAVX2::<2>::new(&patterns)
                }
                (3, false, _) => {
                    debug!("Teddy choice: 128-bit slim, 3 bytes");
                    SlimSSSE3::<3>::new(&patterns)
                }
                (3, true, false) => {
                    debug!("Teddy choice: 256-bit slim, 3 bytes");
                    SlimAVX2::<3>::new(&patterns)
                }
                (3, true, true) => {
                    debug!("Teddy choice: 256-bit fat, 3 bytes");
                    FatAVX2::<3>::new(&patterns)
                }
                (4, false, _) => {
                    debug!("Teddy choice: 128-bit slim, 4 bytes");
                    SlimSSSE3::<4>::new(&patterns)
                }
                (4, true, false) => {
                    debug!("Teddy choice: 256-bit slim, 4 bytes");
                    SlimAVX2::<4>::new(&patterns)
                }
                (4, true, true) => {
                    debug!("Teddy choice: 256-bit fat, 4 bytes");
                    FatAVX2::<4>::new(&patterns)
                }
                _ => {
                    debug!("no supported Teddy configuration found");
                    None
                }
            }
        }
        #[cfg(all(
            target_arch = "aarch64",
            target_feature = "neon",
            target_endian = "little"
        ))]
        {
            use self::aarch64::SlimNeon;

            let mask_len = core::cmp::min(4, patterns.minimum_len());
            if self.only_256bit == Some(true) {
                debug!(
                    "skipping Teddy because 256-bits were demanded \
                     but unavailable"
                );
                return None;
            }
            if self.only_fat == Some(true) {
                debug!(
                    "skipping Teddy because fat was demanded but unavailable"
                );
                return None;
            }
            // Since we don't have Fat Teddy in aarch64 (I think we'd want
            // at least 256-bit vectors for that), we need to be careful
            // not to allow too many patterns as it
            // might overwhelm Teddy. Generally speaking, as the mask length
            // goes up, the more patterns we can handle because the mask
            // length results in fewer candidates generated.
            //
            // These thresholds were determined by looking at the
            // measurements for the rust/aho-corasick/packed/leftmost-first
            // and rust/aho-corasick/dfa/leftmost-first engines on the
            // `teddy/` benchmarks.
            match mask_len {
                1 => {
                    if patlimit && patterns.len() > 16 {
                        debug!(
                            "skipping Teddy (mask len: 1) because there are \
                             too many patterns",
                        );
                        return None;
                    }
                    debug!("Teddy choice: 128-bit slim, 1 byte");
                    SlimNeon::<1>::new(&patterns)
                }
                2 => {
                    if patlimit && patterns.len() > 32 {
                        debug!(
                            "skipping Teddy (mask len: 2) because there are \
                             too many patterns",
                        );
                        return None;
                    }
                    debug!("Teddy choice: 128-bit slim, 2 bytes");
                    SlimNeon::<2>::new(&patterns)
                }
                3 => {
                    if patlimit && patterns.len() > 48 {
                        debug!(
                            "skipping Teddy (mask len: 3) because there are \
                             too many patterns",
                        );
                        return None;
                    }
                    debug!("Teddy choice: 128-bit slim, 3 bytes");
                    SlimNeon::<3>::new(&patterns)
                }
                4 => {
                    debug!("Teddy choice: 128-bit slim, 4 bytes");
                    SlimNeon::<4>::new(&patterns)
                }
                _ => {
                    debug!("no supported Teddy configuration found");
                    None
                }
            }
        }
        #[cfg(not(any(
            all(target_arch = "x86_64", target_feature = "sse2"),
            all(
                target_arch = "aarch64",
                target_feature = "neon",
                target_endian = "little"
            )
        )))]
        {
            None
        }
    }
}

/// A searcher that dispatches to one of several possible Teddy variants.
#[derive(Clone, Debug)]
pub(crate) struct Searcher {
    /// The Teddy variant we use. We use dynamic dispatch under the theory
    /// that it results in better codegen than an enum, although this is a
    /// specious claim.
    ///
    /// This `Searcher` is essentially a wrapper for a `SearcherT` trait
    /// object. We just make `memory_usage` and `minimum_len` available
    /// without going through dynamic dispatch.
    imp: Arc<dyn SearcherT>,
    /// Total heap memory used by the Teddy variant.
    memory_usage: usize,
    /// The minimum haystack length this searcher can handle. It is intended
    /// for callers to use some other search routine (such as Rabin-Karp) in
    /// cases where the haystack (or remainder of the haystack) is too
    /// short.
    minimum_len: usize,
}

impl Searcher {
    /// Look for the leftmost occurrence of any pattern in this search in
    /// the given haystack starting at the given position.
    ///
    /// # Panics
    ///
    /// This panics when `haystack[at..].len()` is less than the minimum
    /// length for this haystack.
    #[inline(always)]
    pub(crate) fn find(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Option<crate::Match> {
        // SAFETY: The Teddy implementations all require a minimum haystack
        // length, and this is required for safety. Therefore, we assert it
        // here in order to make this method sound.
        assert!(haystack[at..].len() >= self.minimum_len);
        let hayptr = haystack.as_ptr();
        // SAFETY: Construction of the searcher guarantees that we are able
        // to run it in the current environment (i.e., we won't get an AVX2
        // searcher on a x86-64 CPU without AVX2 support). Also, the
        // pointers are valid as they are derived directly from a borrowed
        // slice.
        let teddym = unsafe {
            self.imp.find(hayptr.add(at), hayptr.add(haystack.len()))?
        };
        let start = teddym.start().as_usize().wrapping_sub(hayptr.as_usize());
        let end = teddym.end().as_usize().wrapping_sub(hayptr.as_usize());
        let span = crate::Span { start, end };
        // OK because we won't permit the construction of a searcher that
        // could report a pattern ID bigger than what can fit in the
        // crate-wide PatternID type.
let pid = crate::PatternID::new_unchecked(teddym.pattern().as_usize()); let m = crate::Match::new(pid, span); Some(m) } /// Returns the approximate total amount of heap used by this type, in /// units of bytes. #[inline(always)] pub(crate) fn memory_usage(&self) -> usize { self.memory_usage } /// Returns the minimum length, in bytes, that a haystack must be in order /// to use it with this searcher. #[inline(always)] pub(crate) fn minimum_len(&self) -> usize { self.minimum_len } } /// A trait that provides dynamic dispatch over the different possible Teddy /// variants on the same algorithm. /// /// On `x86_64` for example, it isn't known until runtime which of 12 possible /// variants will be used. One might use one of the four slim 128-bit vector /// variants, or one of the four 256-bit vector variants or even one of the /// four fat 256-bit vector variants. /// /// Since this choice is generally made when the Teddy searcher is constructed /// and this choice is based on the patterns given and what the current CPU /// supports, it follows that there must be some kind of indirection at search /// time that "selects" the variant chosen at build time. /// /// There are a few different ways to go about this. One approach is to use an /// enum. It works fine, but in my experiments, this generally results in worse /// codegen. Another approach, which is what we use here, is dynamic dispatch /// via a trait object. We basically implement this trait for each possible /// variant, select the variant we want at build time and convert it to a /// trait object for use at search time. /// /// Another approach is to use function pointers and stick each of the possible /// variants into a union. This is essentially isomorphic to the dynamic /// dispatch approach, but doesn't require any allocations. Since this crate /// requires `alloc`, there's no real reason (AFAIK) to go down this path. (The /// `memchr` crate does this.) trait SearcherT: Debug + Send + Sync + UnwindSafe + RefUnwindSafe + 'static { /// Execute a search on the given haystack (identified by `start` and `end` /// raw pointers). /// /// # Safety /// /// Essentially, the `start` and `end` pointers must be valid and point /// to a haystack one can read. As long as you derive them from, for /// example, a `&[u8]`, they should automatically satisfy all of the safety /// obligations: /// /// * Both `start` and `end` must be valid for reads. /// * Both `start` and `end` must point to an initialized value. /// * Both `start` and `end` must point to the same allocated object and /// must either be in bounds or at most one byte past the end of the /// allocated object. /// * Both `start` and `end` must be _derived from_ a pointer to the same /// object. /// * The distance between `start` and `end` must not overflow `isize`. /// * The distance being in bounds must not rely on "wrapping around" the /// address space. /// * It must be the case that `start <= end`. /// * `end - start` must be greater than the minimum length for this /// searcher. /// /// Also, it is expected that implementations of this trait will tag this /// method with a `target_feature` attribute. Callers must ensure that /// they are executing this method in an environment where that attribute /// is valid. 
    unsafe fn find(&self, start: *const u8, end: *const u8) -> Option<Match>;
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
mod x86_64 {
    use core::arch::x86_64::{__m128i, __m256i};

    use alloc::sync::Arc;

    use crate::packed::{
        ext::Pointer,
        pattern::Patterns,
        teddy::generic::{self, Match},
    };

    use super::{Searcher, SearcherT};

    #[derive(Clone, Debug)]
    pub(super) struct SlimSSSE3<const BYTES: usize> {
        slim128: generic::Slim<__m128i, BYTES>,
    }

    // Defines SlimSSSE3 wrapper functions for 1, 2, 3 and 4 bytes.
    macro_rules! slim_ssse3 {
        ($len:expr) => {
            impl SlimSSSE3<$len> {
                /// Creates a new searcher using "slim" Teddy with 128-bit
                /// vectors. If SSSE3 is not available in the current
                /// environment, then this returns `None`.
                pub(super) fn new(
                    patterns: &Arc<Patterns>,
                ) -> Option<Searcher> {
                    if !is_available_ssse3() {
                        return None;
                    }
                    Some(unsafe { SlimSSSE3::<$len>::new_unchecked(patterns) })
                }

                /// Creates a new searcher using "slim" Teddy with 128-bit
                /// vectors without checking whether SSSE3 is available or
                /// not.
                ///
                /// # Safety
                ///
                /// Callers must ensure that SSSE3 is available in the
                /// current environment.
                #[target_feature(enable = "ssse3")]
                unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
                    let slim128 = generic::Slim::<__m128i, $len>::new(
                        Arc::clone(patterns),
                    );
                    let memory_usage = slim128.memory_usage();
                    let minimum_len = slim128.minimum_len();
                    let imp = Arc::new(SlimSSSE3 { slim128 });
                    Searcher { imp, memory_usage, minimum_len }
                }
            }

            impl SearcherT for SlimSSSE3<$len> {
                #[target_feature(enable = "ssse3")]
                #[inline]
                unsafe fn find(
                    &self,
                    start: *const u8,
                    end: *const u8,
                ) -> Option<Match> {
                    // SAFETY: All obligations except for `target_feature`
                    // are passed to the caller. Our use of `target_feature`
                    // is safe because construction of this type requires
                    // that the requisite target features are available.
                    self.slim128.find(start, end)
                }
            }
        };
    }

    slim_ssse3!(1);
    slim_ssse3!(2);
    slim_ssse3!(3);
    slim_ssse3!(4);

    #[derive(Clone, Debug)]
    pub(super) struct SlimAVX2<const BYTES: usize> {
        slim128: generic::Slim<__m128i, BYTES>,
        slim256: generic::Slim<__m256i, BYTES>,
    }

    // Defines SlimAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
    macro_rules! slim_avx2 {
        ($len:expr) => {
            impl SlimAVX2<$len> {
                /// Creates a new searcher using "slim" Teddy with 256-bit
                /// vectors. If AVX2 is not available in the current
                /// environment, then this returns `None`.
                pub(super) fn new(
                    patterns: &Arc<Patterns>,
                ) -> Option<Searcher> {
                    if !is_available_avx2() {
                        return None;
                    }
                    Some(unsafe { SlimAVX2::<$len>::new_unchecked(patterns) })
                }

                /// Creates a new searcher using "slim" Teddy with 256-bit
                /// vectors without checking whether AVX2 is available or
                /// not.
                ///
                /// # Safety
                ///
                /// Callers must ensure that AVX2 is available in the current
                /// environment.
                #[target_feature(enable = "avx2")]
                unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
                    let slim128 = generic::Slim::<__m128i, $len>::new(
                        Arc::clone(&patterns),
                    );
                    let slim256 = generic::Slim::<__m256i, $len>::new(
                        Arc::clone(&patterns),
                    );
                    let memory_usage =
                        slim128.memory_usage() + slim256.memory_usage();
                    let minimum_len = slim128.minimum_len();
                    let imp = Arc::new(SlimAVX2 { slim128, slim256 });
                    Searcher { imp, memory_usage, minimum_len }
                }
            }

            impl SearcherT for SlimAVX2<$len> {
                #[target_feature(enable = "avx2")]
                #[inline]
                unsafe fn find(
                    &self,
                    start: *const u8,
                    end: *const u8,
                ) -> Option<Match> {
                    // SAFETY: All obligations except for `target_feature`
                    // are passed to the caller. Our use of `target_feature`
                    // is safe because construction of this type requires
                    // that the requisite target features are available.
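                    // An illustrative note on the dispatch below (numbers
                    // assume $len = 1): the 256-bit searcher requires at
                    // least `slim256.minimum_len() = 32` haystack bytes, so
                    // a 20-byte haystack falls back to the 128-bit searcher
                    // while anything longer is handled with the full
                    // 256-bit vectors.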
                    let len = end.distance(start);
                    if len < self.slim256.minimum_len() {
                        self.slim128.find(start, end)
                    } else {
                        self.slim256.find(start, end)
                    }
                }
            }
        };
    }

    slim_avx2!(1);
    slim_avx2!(2);
    slim_avx2!(3);
    slim_avx2!(4);

    #[derive(Clone, Debug)]
    pub(super) struct FatAVX2<const BYTES: usize> {
        fat256: generic::Fat<__m256i, BYTES>,
    }

    // Defines FatAVX2 wrapper functions for 1, 2, 3 and 4 bytes.
    macro_rules! fat_avx2 {
        ($len:expr) => {
            impl FatAVX2<$len> {
                /// Creates a new searcher using "fat" Teddy with 256-bit
                /// vectors. If AVX2 is not available in the current
                /// environment, then this returns `None`.
                pub(super) fn new(
                    patterns: &Arc<Patterns>,
                ) -> Option<Searcher> {
                    if !is_available_avx2() {
                        return None;
                    }
                    Some(unsafe { FatAVX2::<$len>::new_unchecked(patterns) })
                }

                /// Creates a new searcher using "fat" Teddy with 256-bit
                /// vectors without checking whether AVX2 is available or
                /// not.
                ///
                /// # Safety
                ///
                /// Callers must ensure that AVX2 is available in the current
                /// environment.
                #[target_feature(enable = "avx2")]
                unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
                    let fat256 = generic::Fat::<__m256i, $len>::new(
                        Arc::clone(&patterns),
                    );
                    let memory_usage = fat256.memory_usage();
                    let minimum_len = fat256.minimum_len();
                    let imp = Arc::new(FatAVX2 { fat256 });
                    Searcher { imp, memory_usage, minimum_len }
                }
            }

            impl SearcherT for FatAVX2<$len> {
                #[target_feature(enable = "avx2")]
                #[inline]
                unsafe fn find(
                    &self,
                    start: *const u8,
                    end: *const u8,
                ) -> Option<Match> {
                    // SAFETY: All obligations except for `target_feature`
                    // are passed to the caller. Our use of `target_feature`
                    // is safe because construction of this type requires
                    // that the requisite target features are available.
                    self.fat256.find(start, end)
                }
            }
        };
    }

    fat_avx2!(1);
    fat_avx2!(2);
    fat_avx2!(3);
    fat_avx2!(4);

    #[inline]
    pub(super) fn is_available_ssse3() -> bool {
        #[cfg(not(target_feature = "sse2"))]
        {
            false
        }
        #[cfg(target_feature = "sse2")]
        {
            #[cfg(target_feature = "ssse3")]
            {
                true
            }
            #[cfg(not(target_feature = "ssse3"))]
            {
                #[cfg(feature = "std")]
                {
                    std::is_x86_feature_detected!("ssse3")
                }
                #[cfg(not(feature = "std"))]
                {
                    false
                }
            }
        }
    }

    #[inline]
    pub(super) fn is_available_avx2() -> bool {
        #[cfg(not(target_feature = "sse2"))]
        {
            false
        }
        #[cfg(target_feature = "sse2")]
        {
            #[cfg(target_feature = "avx2")]
            {
                true
            }
            #[cfg(not(target_feature = "avx2"))]
            {
                #[cfg(feature = "std")]
                {
                    std::is_x86_feature_detected!("avx2")
                }
                #[cfg(not(feature = "std"))]
                {
                    false
                }
            }
        }
    }
}

#[cfg(all(
    target_arch = "aarch64",
    target_feature = "neon",
    target_endian = "little"
))]
mod aarch64 {
    use core::arch::aarch64::uint8x16_t;

    use alloc::sync::Arc;

    use crate::packed::{
        pattern::Patterns,
        teddy::generic::{self, Match},
    };

    use super::{Searcher, SearcherT};

    #[derive(Clone, Debug)]
    pub(super) struct SlimNeon<const BYTES: usize> {
        slim128: generic::Slim<uint8x16_t, BYTES>,
    }

    // Defines SlimNeon wrapper functions for 1, 2, 3 and 4 bytes.
    macro_rules! slim_neon {
        ($len:expr) => {
            impl SlimNeon<$len> {
                /// Creates a new searcher using "slim" Teddy with 128-bit
                /// vectors. Since NEON is always available when this module
                /// is compiled, this never returns `None`.
                pub(super) fn new(
                    patterns: &Arc<Patterns>,
                ) -> Option<Searcher> {
                    Some(unsafe { SlimNeon::<$len>::new_unchecked(patterns) })
                }

                /// Creates a new searcher using "slim" Teddy with 128-bit
                /// vectors without checking whether NEON is available or
                /// not.
                ///
                /// # Safety
                ///
                /// Callers must ensure that NEON is available in the
                /// current environment.
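                /// (Strictly speaking, this obligation is inherited from
                /// the `target_feature(enable = "neon")` attribute below;
                /// since the enclosing module is only compiled when the
                /// `neon` target feature is enabled, it cannot be violated
                /// in practice.)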
                #[target_feature(enable = "neon")]
                unsafe fn new_unchecked(patterns: &Arc<Patterns>) -> Searcher {
                    let slim128 = generic::Slim::<uint8x16_t, $len>::new(
                        Arc::clone(patterns),
                    );
                    let memory_usage = slim128.memory_usage();
                    let minimum_len = slim128.minimum_len();
                    let imp = Arc::new(SlimNeon { slim128 });
                    Searcher { imp, memory_usage, minimum_len }
                }
            }

            impl SearcherT for SlimNeon<$len> {
                #[target_feature(enable = "neon")]
                #[inline]
                unsafe fn find(
                    &self,
                    start: *const u8,
                    end: *const u8,
                ) -> Option<Match> {
                    // SAFETY: All obligations except for `target_feature`
                    // are passed to the caller. Our use of `target_feature`
                    // is safe because construction of this type requires
                    // that the requisite target features are available.
                    self.slim128.find(start, end)
                }
            }
        };
    }

    slim_neon!(1);
    slim_neon!(2);
    slim_neon!(3);
    slim_neon!(4);
}
aho-corasick-1.1.3/src/packed/teddy/generic.rs000064400000000000000000001400751046102023000173110ustar 00000000000000use core::fmt::Debug;

use alloc::{
    boxed::Box, collections::BTreeMap, format, sync::Arc, vec, vec::Vec,
};

use crate::{
    packed::{
        ext::Pointer,
        pattern::Patterns,
        vector::{FatVector, Vector},
    },
    util::int::U32,
    PatternID,
};

/// A match type specialized to the Teddy implementations below.
///
/// Essentially, instead of representing a match at byte offsets, we use
/// raw pointers. This is because the implementations below operate on raw
/// pointers, and so this is a more natural return type based on how the
/// implementation works.
///
/// Also, the `PatternID` used here is a `u16`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Match {
    pid: PatternID,
    start: *const u8,
    end: *const u8,
}

impl Match {
    /// Returns the ID of the pattern that matched.
    pub(crate) fn pattern(&self) -> PatternID {
        self.pid
    }

    /// Returns a pointer into the haystack at which the match starts.
    pub(crate) fn start(&self) -> *const u8 {
        self.start
    }

    /// Returns a pointer into the haystack at which the match ends.
    pub(crate) fn end(&self) -> *const u8 {
        self.end
    }
}

/// A "slim" Teddy implementation that is generic over both the vector type
/// and the minimum length of the patterns being searched for.
///
/// Only 1, 2, 3 and 4 bytes are supported as minimum lengths.
#[derive(Clone, Debug)]
pub(crate) struct Slim<V, const BYTES: usize> {
    /// A generic data structure for doing "slim" Teddy verification.
    teddy: Teddy<8>,
    /// The masks used as inputs to the shuffle operation to generate
    /// candidates (which are fed into the verification routines).
    masks: [Mask<V>; BYTES],
}

impl<V: Vector, const BYTES: usize> Slim<V, BYTES> {
    /// Create a new "slim" Teddy searcher for the given patterns.
    ///
    /// # Panics
    ///
    /// This panics when `BYTES` is any value other than 1, 2, 3 or 4.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target
    /// for the current CPU.
    #[inline(always)]
    pub(crate) unsafe fn new(patterns: Arc<Patterns>) -> Slim<V, BYTES> {
        assert!(
            1 <= BYTES && BYTES <= 4,
            "only 1, 2, 3 or 4 bytes are supported"
        );
        let teddy = Teddy::new(patterns);
        let masks = SlimMaskBuilder::from_teddy(&teddy);
        Slim { teddy, masks }
    }

    /// Returns the approximate total amount of heap used by this type, in
    /// units of bytes.
    #[inline(always)]
    pub(crate) fn memory_usage(&self) -> usize {
        self.teddy.memory_usage()
    }

    /// Returns the minimum length, in bytes, that a haystack must be in
    /// order to use it with this searcher.
    #[inline(always)]
    pub(crate) fn minimum_len(&self) -> usize {
        V::BYTES + (BYTES - 1)
    }
}

impl<V: Vector> Slim<V, 1> {
    /// Look for an occurrence of the patterns in this finder in the haystack
    /// given by the `start` and `end` pointers.
/// /// If no match could be found, then `None` is returned. /// /// # Safety /// /// The given pointers representing the haystack must be valid to read /// from. They must also point to a region of memory that is at least the /// minimum length required by this searcher. /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] pub(crate) unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { let len = end.distance(start); debug_assert!(len >= self.minimum_len()); let mut cur = start; while cur <= end.sub(V::BYTES) { if let Some(m) = self.find_one(cur, end) { return Some(m); } cur = cur.add(V::BYTES); } if cur < end { cur = end.sub(V::BYTES); if let Some(m) = self.find_one(cur, end) { return Some(m); } } None } /// Look for a match starting at the `V::BYTES` at and after `cur`. If /// there isn't one, then `None` is returned. /// /// # Safety /// /// The given pointers representing the haystack must be valid to read /// from. They must also point to a region of memory that is at least the /// minimum length required by this searcher. /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] unsafe fn find_one( &self, cur: *const u8, end: *const u8, ) -> Option { let c = self.candidate(cur); if !c.is_zero() { if let Some(m) = self.teddy.verify(cur, end, c) { return Some(m); } } None } /// Look for a candidate match (represented as a vector) starting at the /// `V::BYTES` at and after `cur`. If there isn't one, then a vector with /// all bits set to zero is returned. /// /// # Safety /// /// The given pointer representing the haystack must be valid to read /// from. /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] unsafe fn candidate(&self, cur: *const u8) -> V { let chunk = V::load_unaligned(cur); Mask::members1(chunk, self.masks) } } impl Slim { /// See Slim::find. #[inline(always)] pub(crate) unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { let len = end.distance(start); debug_assert!(len >= self.minimum_len()); let mut cur = start.add(1); let mut prev0 = V::splat(0xFF); while cur <= end.sub(V::BYTES) { if let Some(m) = self.find_one(cur, end, &mut prev0) { return Some(m); } cur = cur.add(V::BYTES); } if cur < end { cur = end.sub(V::BYTES); prev0 = V::splat(0xFF); if let Some(m) = self.find_one(cur, end, &mut prev0) { return Some(m); } } None } /// See Slim::find_one. #[inline(always)] unsafe fn find_one( &self, cur: *const u8, end: *const u8, prev0: &mut V, ) -> Option { let c = self.candidate(cur, prev0); if !c.is_zero() { if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { return Some(m); } } None } /// See Slim::candidate. #[inline(always)] unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { let chunk = V::load_unaligned(cur); let (res0, res1) = Mask::members2(chunk, self.masks); let res0prev0 = res0.shift_in_one_byte(*prev0); let res = res0prev0.and(res1); *prev0 = res0; res } } impl Slim { /// See Slim::find. 
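    // N.B. A worked sketch of the multi-byte candidate logic above (the
    // two-byte case; lane values are hypothetical): `res0` marks lanes whose
    // byte matches some pattern's first byte, and `res1` marks lanes
    // matching some pattern's second byte. `shift_in_one_byte` slides `res0`
    // forward by one lane (pulling in the last lane of the previous window
    // via `prev0`), so after the AND, a set bit in lane `j` means bytes
    // `j-1` and `j` both matched, i.e. a candidate starting at `cur - 1 + j`
    // (hence `verify` is called with `cur.sub(1)`). The three- and four-byte
    // variants below extend this with additional `prev` registers and larger
    // shifts.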
#[inline(always)] pub(crate) unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { let len = end.distance(start); debug_assert!(len >= self.minimum_len()); let mut cur = start.add(2); let mut prev0 = V::splat(0xFF); let mut prev1 = V::splat(0xFF); while cur <= end.sub(V::BYTES) { if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { return Some(m); } cur = cur.add(V::BYTES); } if cur < end { cur = end.sub(V::BYTES); prev0 = V::splat(0xFF); prev1 = V::splat(0xFF); if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { return Some(m); } } None } /// See Slim::find_one. #[inline(always)] unsafe fn find_one( &self, cur: *const u8, end: *const u8, prev0: &mut V, prev1: &mut V, ) -> Option { let c = self.candidate(cur, prev0, prev1); if !c.is_zero() { if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { return Some(m); } } None } /// See Slim::candidate. #[inline(always)] unsafe fn candidate( &self, cur: *const u8, prev0: &mut V, prev1: &mut V, ) -> V { let chunk = V::load_unaligned(cur); let (res0, res1, res2) = Mask::members3(chunk, self.masks); let res0prev0 = res0.shift_in_two_bytes(*prev0); let res1prev1 = res1.shift_in_one_byte(*prev1); let res = res0prev0.and(res1prev1).and(res2); *prev0 = res0; *prev1 = res1; res } } impl Slim { /// See Slim::find. #[inline(always)] pub(crate) unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { let len = end.distance(start); debug_assert!(len >= self.minimum_len()); let mut cur = start.add(3); let mut prev0 = V::splat(0xFF); let mut prev1 = V::splat(0xFF); let mut prev2 = V::splat(0xFF); while cur <= end.sub(V::BYTES) { if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) { return Some(m); } cur = cur.add(V::BYTES); } if cur < end { cur = end.sub(V::BYTES); prev0 = V::splat(0xFF); prev1 = V::splat(0xFF); prev2 = V::splat(0xFF); if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) { return Some(m); } } None } /// See Slim::find_one. #[inline(always)] unsafe fn find_one( &self, cur: *const u8, end: *const u8, prev0: &mut V, prev1: &mut V, prev2: &mut V, ) -> Option { let c = self.candidate(cur, prev0, prev1, prev2); if !c.is_zero() { if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { return Some(m); } } None } /// See Slim::candidate. #[inline(always)] unsafe fn candidate( &self, cur: *const u8, prev0: &mut V, prev1: &mut V, prev2: &mut V, ) -> V { let chunk = V::load_unaligned(cur); let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); let res0prev0 = res0.shift_in_three_bytes(*prev0); let res1prev1 = res1.shift_in_two_bytes(*prev1); let res2prev2 = res2.shift_in_one_byte(*prev2); let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); *prev0 = res0; *prev1 = res1; *prev2 = res2; res } } /// A "fat" Teddy implementation that is generic over both the vector type /// and the minimum length of the patterns being searched for. /// /// Only 1, 2, 3 and 4 bytes are supported as minimum lengths. #[derive(Clone, Debug)] pub(crate) struct Fat { /// A generic data structure for doing "fat" Teddy verification. teddy: Teddy<16>, /// The masks used as inputs to the shuffle operation to generate /// candidates (which are fed into the verification routines). masks: [Mask; BYTES], } impl Fat { /// Create a new "fat" Teddy searcher for the given patterns. /// /// # Panics /// /// This panics when `BYTES` is any value other than 1, 2, 3 or 4. 
/// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] pub(crate) unsafe fn new(patterns: Arc) -> Fat { assert!( 1 <= BYTES && BYTES <= 4, "only 1, 2, 3 or 4 bytes are supported" ); let teddy = Teddy::new(patterns); let masks = FatMaskBuilder::from_teddy(&teddy); Fat { teddy, masks } } /// Returns the approximate total amount of heap used by this type, in /// units of bytes. #[inline(always)] pub(crate) fn memory_usage(&self) -> usize { self.teddy.memory_usage() } /// Returns the minimum length, in bytes, that a haystack must be in order /// to use it with this searcher. #[inline(always)] pub(crate) fn minimum_len(&self) -> usize { V::Half::BYTES + (BYTES - 1) } } impl Fat { /// Look for an occurrences of the patterns in this finder in the haystack /// given by the `start` and `end` pointers. /// /// If no match could be found, then `None` is returned. /// /// # Safety /// /// The given pointers representing the haystack must be valid to read /// from. They must also point to a region of memory that is at least the /// minimum length required by this searcher. /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] pub(crate) unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { let len = end.distance(start); debug_assert!(len >= self.minimum_len()); let mut cur = start; while cur <= end.sub(V::Half::BYTES) { if let Some(m) = self.find_one(cur, end) { return Some(m); } cur = cur.add(V::Half::BYTES); } if cur < end { cur = end.sub(V::Half::BYTES); if let Some(m) = self.find_one(cur, end) { return Some(m); } } None } /// Look for a match starting at the `V::BYTES` at and after `cur`. If /// there isn't one, then `None` is returned. /// /// # Safety /// /// The given pointers representing the haystack must be valid to read /// from. They must also point to a region of memory that is at least the /// minimum length required by this searcher. /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] unsafe fn find_one( &self, cur: *const u8, end: *const u8, ) -> Option { let c = self.candidate(cur); if !c.is_zero() { if let Some(m) = self.teddy.verify(cur, end, c) { return Some(m); } } None } /// Look for a candidate match (represented as a vector) starting at the /// `V::BYTES` at and after `cur`. If there isn't one, then a vector with /// all bits set to zero is returned. /// /// # Safety /// /// The given pointer representing the haystack must be valid to read /// from. /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] unsafe fn candidate(&self, cur: *const u8) -> V { let chunk = V::load_half_unaligned(cur); Mask::members1(chunk, self.masks) } } impl Fat { /// See `Fat::find`. #[inline(always)] pub(crate) unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { let len = end.distance(start); debug_assert!(len >= self.minimum_len()); let mut cur = start.add(1); let mut prev0 = V::splat(0xFF); while cur <= end.sub(V::Half::BYTES) { if let Some(m) = self.find_one(cur, end, &mut prev0) { return Some(m); } cur = cur.add(V::Half::BYTES); } if cur < end { cur = end.sub(V::Half::BYTES); prev0 = V::splat(0xFF); if let Some(m) = self.find_one(cur, end, &mut prev0) { return Some(m); } } None } /// See `Fat::find_one`. 
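    // N.B. Note that the "fat" search loops advance by `V::Half::BYTES`
    // rather than `V::BYTES`: fat Teddy loads a half-width haystack window
    // and repeats it across both 128-bit halves of the vector, trading
    // per-iteration throughput for twice as many buckets (16 instead of 8)
    // and thus fewer false positive candidates.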
#[inline(always)] unsafe fn find_one( &self, cur: *const u8, end: *const u8, prev0: &mut V, ) -> Option { let c = self.candidate(cur, prev0); if !c.is_zero() { if let Some(m) = self.teddy.verify(cur.sub(1), end, c) { return Some(m); } } None } /// See `Fat::candidate`. #[inline(always)] unsafe fn candidate(&self, cur: *const u8, prev0: &mut V) -> V { let chunk = V::load_half_unaligned(cur); let (res0, res1) = Mask::members2(chunk, self.masks); let res0prev0 = res0.half_shift_in_one_byte(*prev0); let res = res0prev0.and(res1); *prev0 = res0; res } } impl Fat { /// See `Fat::find`. #[inline(always)] pub(crate) unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { let len = end.distance(start); debug_assert!(len >= self.minimum_len()); let mut cur = start.add(2); let mut prev0 = V::splat(0xFF); let mut prev1 = V::splat(0xFF); while cur <= end.sub(V::Half::BYTES) { if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { return Some(m); } cur = cur.add(V::Half::BYTES); } if cur < end { cur = end.sub(V::Half::BYTES); prev0 = V::splat(0xFF); prev1 = V::splat(0xFF); if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1) { return Some(m); } } None } /// See `Fat::find_one`. #[inline(always)] unsafe fn find_one( &self, cur: *const u8, end: *const u8, prev0: &mut V, prev1: &mut V, ) -> Option { let c = self.candidate(cur, prev0, prev1); if !c.is_zero() { if let Some(m) = self.teddy.verify(cur.sub(2), end, c) { return Some(m); } } None } /// See `Fat::candidate`. #[inline(always)] unsafe fn candidate( &self, cur: *const u8, prev0: &mut V, prev1: &mut V, ) -> V { let chunk = V::load_half_unaligned(cur); let (res0, res1, res2) = Mask::members3(chunk, self.masks); let res0prev0 = res0.half_shift_in_two_bytes(*prev0); let res1prev1 = res1.half_shift_in_one_byte(*prev1); let res = res0prev0.and(res1prev1).and(res2); *prev0 = res0; *prev1 = res1; res } } impl Fat { /// See `Fat::find`. #[inline(always)] pub(crate) unsafe fn find( &self, start: *const u8, end: *const u8, ) -> Option { let len = end.distance(start); debug_assert!(len >= self.minimum_len()); let mut cur = start.add(3); let mut prev0 = V::splat(0xFF); let mut prev1 = V::splat(0xFF); let mut prev2 = V::splat(0xFF); while cur <= end.sub(V::Half::BYTES) { if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) { return Some(m); } cur = cur.add(V::Half::BYTES); } if cur < end { cur = end.sub(V::Half::BYTES); prev0 = V::splat(0xFF); prev1 = V::splat(0xFF); prev2 = V::splat(0xFF); if let Some(m) = self.find_one(cur, end, &mut prev0, &mut prev1, &mut prev2) { return Some(m); } } None } /// See `Fat::find_one`. #[inline(always)] unsafe fn find_one( &self, cur: *const u8, end: *const u8, prev0: &mut V, prev1: &mut V, prev2: &mut V, ) -> Option { let c = self.candidate(cur, prev0, prev1, prev2); if !c.is_zero() { if let Some(m) = self.teddy.verify(cur.sub(3), end, c) { return Some(m); } } None } /// See `Fat::candidate`. #[inline(always)] unsafe fn candidate( &self, cur: *const u8, prev0: &mut V, prev1: &mut V, prev2: &mut V, ) -> V { let chunk = V::load_half_unaligned(cur); let (res0, res1, res2, res3) = Mask::members4(chunk, self.masks); let res0prev0 = res0.half_shift_in_three_bytes(*prev0); let res1prev1 = res1.half_shift_in_two_bytes(*prev1); let res2prev2 = res2.half_shift_in_one_byte(*prev2); let res = res0prev0.and(res1prev1).and(res2prev2).and(res3); *prev0 = res0; *prev1 = res1; *prev2 = res2; res } } /// The common elements of all "slim" and "fat" Teddy search implementations. 
///
/// Essentially, this contains the patterns and the buckets. Namely, it
/// contains enough to implement the verification step after candidates are
/// identified via the shuffle masks.
///
/// It is generic over the number of buckets used. In general, the number of
/// buckets is either 8 (for "slim" Teddy) or 16 (for "fat" Teddy). The
/// generic parameter isn't really meant to be instantiated for any value
/// other than 8 or 16, although it is technically possible. The main hiccup
/// is that there is some bit-shifting done in the critical part of
/// verification that could be quite expensive if `BUCKETS` is not a power
/// of 2.
#[derive(Clone, Debug)]
struct Teddy<const BUCKETS: usize> {
    /// The patterns we are searching for.
    ///
    /// A pattern string can be found by its `PatternID`.
    patterns: Arc<Patterns>,
    /// The allocation of patterns in buckets. This only contains the IDs of
    /// patterns. In order to do full verification, callers must provide the
    /// actual patterns when using Teddy.
    buckets: [Vec<PatternID>; BUCKETS],
    // N.B. The above representation is very simple, but it definitely
    // results in ping-ponging between different allocations during
    // verification. I've tried experimenting with other representations
    // that flatten the pattern strings into a single allocation, but it
    // doesn't seem to help much. Probably everything is small enough to
    // fit into cache anyway, and so the pointer chasing isn't a big deal?
    //
    // One other avenue I haven't explored is some kind of hashing trick
    // that lets us do another high-confidence check before launching into
    // `memcmp`.
}

impl<const BUCKETS: usize> Teddy<BUCKETS> {
    /// Create a new generic data structure for Teddy verification.
    fn new(patterns: Arc<Patterns>) -> Teddy<BUCKETS> {
        assert_ne!(0, patterns.len(), "Teddy requires at least one pattern");
        assert_ne!(
            0,
            patterns.minimum_len(),
            "Teddy does not support zero-length patterns"
        );
        assert!(
            BUCKETS == 8 || BUCKETS == 16,
            "Teddy only supports 8 or 16 buckets"
        );
        // MSRV(1.63): Use core::array::from_fn below instead of allocating
        // a superfluous outer Vec. Not a big deal (especially given the
        // BTreeMap allocation below), but nice to not do it.
        let buckets =
            <[Vec<PatternID>; BUCKETS]>::try_from(vec![vec![]; BUCKETS])
                .unwrap();
        let mut t = Teddy { patterns, buckets };

        let mut map: BTreeMap<Vec<u8>, usize> = BTreeMap::new();
        for (id, pattern) in t.patterns.iter() {
            // We try to be slightly clever in how we assign patterns into
            // buckets. Generally speaking, we want patterns with the same
            // prefix to be in the same bucket, since it minimizes the amount
            // of time we spend churning through buckets in the verification
            // step.
            //
            // So we could assign patterns with the same N-prefix (where N is
            // the size of the mask, which is one of {1, 2, 3, 4}) to the
            // same bucket. However, case insensitive searches are fairly
            // common, so we'd, for example, ideally want to treat `abc` and
            // `ABC` as if they shared the same prefix. ASCII has the nice
            // property that the lower 4 bits of A and a are the same, so we
            // therefore group patterns with the same low-nybble-N-prefix
            // into the same bucket.
            //
            // MOREOVER, this is actually necessary for correctness! In
            // particular, by grouping patterns with the same prefix into the
            // same bucket, we ensure that we preserve correct leftmost-first
            // and leftmost-longest match semantics. In addition to the fact
            // that `patterns.iter()` iterates in the correct order, this
            // guarantees that all possible ambiguous matches will occur in
            // the same bucket.
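            // (For example, with a mask length of 3, the patterns "abc",
            // "ABC" and "aBc" all share the low-nybble prefix
            // [0x1, 0x2, 0x3], and so all three are guaranteed to land in
            // the same bucket, in the order in which they were given.)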
            // The verification routine could be adjusted to support correct
            // leftmost match semantics regardless of bucket allocation, but
            // that results in a performance hit. It's much nicer to be able
            // to just stop as soon as a match is found.
            let lonybs = pattern.low_nybbles(t.mask_len());
            if let Some(&bucket) = map.get(&lonybs) {
                t.buckets[bucket].push(id);
            } else {
                // N.B. We assign buckets in reverse because it shouldn't
                // have any influence on performance, but it does make it
                // harder to get leftmost match semantics accidentally
                // correct.
                let bucket = (BUCKETS - 1) - (id.as_usize() % BUCKETS);
                t.buckets[bucket].push(id);
                map.insert(lonybs, bucket);
            }
        }
        t
    }

    /// Verify whether there are any matches starting at or after `cur` in
    /// the haystack. The candidate chunk given should correspond to 8-bit
    /// bitsets for `BUCKETS` buckets.
    ///
    /// # Safety
    ///
    /// The given pointers representing the haystack must be valid to read
    /// from.
    #[inline(always)]
    unsafe fn verify64(
        &self,
        cur: *const u8,
        end: *const u8,
        mut candidate_chunk: u64,
    ) -> Option<Match> {
        while candidate_chunk != 0 {
            let bit = candidate_chunk.trailing_zeros().as_usize();
            candidate_chunk &= !(1 << bit);

            let cur = cur.add(bit / BUCKETS);
            let bucket = bit % BUCKETS;
            if let Some(m) = self.verify_bucket(cur, end, bucket) {
                return Some(m);
            }
        }
        None
    }

    /// Verify whether there are any matches starting at `cur` in the
    /// haystack corresponding only to patterns in the given bucket.
    ///
    /// # Safety
    ///
    /// The given pointers representing the haystack must be valid to read
    /// from.
    ///
    /// The bucket index must be less than `self.buckets.len()`.
    #[inline(always)]
    unsafe fn verify_bucket(
        &self,
        cur: *const u8,
        end: *const u8,
        bucket: usize,
    ) -> Option<Match> {
        debug_assert!(bucket < self.buckets.len());
        // SAFETY: The caller must ensure that the bucket index is correct.
        for pid in self.buckets.get_unchecked(bucket).iter().copied() {
            // SAFETY: This is safe because we are guaranteed that every
            // index in a Teddy bucket is a valid index into the patterns,
            // by construction.
            debug_assert!(pid.as_usize() < self.patterns.len());
            let pat = self.patterns.get_unchecked(pid);
            if pat.is_prefix_raw(cur, end) {
                let start = cur;
                let end = start.add(pat.len());
                return Some(Match { pid, start, end });
            }
        }
        None
    }

    /// Returns the total number of masks required by the patterns in this
    /// Teddy searcher.
    ///
    /// Basically, the mask length corresponds to the type of Teddy searcher
    /// to use: a 1-byte, 2-byte, 3-byte or 4-byte searcher. The bigger the
    /// better, typically, since searching for longer substrings usually
    /// decreases the rate of false positives. Therefore, the number of masks
    /// needed is the length of the shortest pattern in this searcher. If the
    /// length of the shortest pattern (in bytes) is bigger than 4, then the
    /// mask length is 4 since there are no Teddy searchers for more than 4
    /// bytes.
    fn mask_len(&self) -> usize {
        core::cmp::min(4, self.patterns.minimum_len())
    }

    /// Returns the approximate total amount of heap used by this type, in
    /// units of bytes.
    fn memory_usage(&self) -> usize {
        // This is an upper bound rather than a precise accounting. No
        // particular reason, other than it's probably very close to actual
        // memory usage in practice.
        self.patterns.len() * core::mem::size_of::<PatternID>()
    }
}

impl Teddy<8> {
    /// Runs the verification routine for "slim" Teddy.
/// /// The candidate given should be a collection of 8-bit bitsets (one bitset /// per lane), where the ith bit is set in the jth lane if and only if the /// byte occurring at `at + j` in `cur` is in the bucket `i`. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. /// /// The given pointers must be valid to read from. #[inline(always)] unsafe fn verify( &self, mut cur: *const u8, end: *const u8, candidate: V, ) -> Option { debug_assert!(!candidate.is_zero()); // Convert the candidate into 64-bit chunks, and then verify each of // those chunks. candidate.for_each_64bit_lane( #[inline(always)] |_, chunk| { let result = self.verify64(cur, end, chunk); cur = cur.add(8); result }, ) } } impl Teddy<16> { /// Runs the verification routine for "fat" Teddy. /// /// The candidate given should be a collection of 8-bit bitsets (one bitset /// per lane), where the ith bit is set in the jth lane if and only if the /// byte occurring at `at + (j < 16 ? j : j - 16)` in `cur` is in the /// bucket `j < 16 ? i : i + 8`. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. /// /// The given pointers must be valid to read from. #[inline(always)] unsafe fn verify( &self, mut cur: *const u8, end: *const u8, candidate: V, ) -> Option { // This is a bit tricky, but we basically want to convert our // candidate, which looks like this (assuming a 256-bit vector): // // a31 a30 ... a17 a16 a15 a14 ... a01 a00 // // where each a(i) is an 8-bit bitset corresponding to the activated // buckets, to this // // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00 // // Namely, for Fat Teddy, the high 128-bits of the candidate correspond // to the same bytes in the haystack in the low 128-bits (so we only // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7. // // The verification routine wants to look at all potentially matching // buckets before moving on to the next lane. So for example, both // a16 and a00 both correspond to the first byte in our window; a00 // contains buckets 0-7 and a16 contains buckets 8-15. Specifically, // a16 should be checked before a01. So the transformation shown above // allows us to use our normal verification procedure with one small // change: we treat each bitset as 16 bits instead of 8 bits. debug_assert!(!candidate.is_zero()); // Swap the 128-bit lanes in the candidate vector. let swapped = candidate.swap_halves(); // Interleave the bytes from the low 128-bit lanes, starting with // cand first. let r1 = candidate.interleave_low_8bit_lanes(swapped); // Interleave the bytes from the high 128-bit lanes, starting with // cand first. let r2 = candidate.interleave_high_8bit_lanes(swapped); // Now just take the 2 low 64-bit integers from both r1 and r2. We // can drop the high 64-bit integers because they are a mirror image // of the low 64-bit integers. All we care about are the low 128-bit // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets // laid out in the desired order, as described above. r1.for_each_low_64bit_lane( r2, #[inline(always)] |_, chunk| { let result = self.verify64(cur, end, chunk); cur = cur.add(4); result }, ) } } /// A vector generic mask for the low and high nybbles in a set of patterns. /// Each 8-bit lane `j` in a vector corresponds to a bitset where the `i`th bit /// is set if and only if the nybble `j` is in the bucket `i` at a particular /// position. 
///
/// This is slightly tweaked depending on whether Slim or Fat Teddy is being
/// used. For Slim Teddy, the bitsets in the lower half are the same as the
/// bitsets in the higher half, so that we can search `V::BYTES` bytes at a
/// time. (Remember, the nybbles in the haystack are used as indices into
/// these masks, and 256-bit shuffles only operate on 128-bit lanes.)
///
/// For Fat Teddy, the bitsets are not repeated, but instead, the high half
/// bits correspond to an additional 8 buckets. So a bitset `00100010` has
/// buckets 1 and 5 set if it's in the lower half, but has buckets 9 and 13
/// set if it's in the higher half.
#[derive(Clone, Copy, Debug)]
struct Mask<V> {
    lo: V,
    hi: V,
}

impl<V: Vector> Mask<V> {
    /// Return a candidate for Teddy (fat or slim) that is searching for
    /// 1-byte candidates.
    ///
    /// If a candidate is returned, it will be a collection of 8-bit bitsets
    /// (one bitset per lane), where the ith bit is set in the jth lane if
    /// and only if the byte occurring at the jth lane in `chunk` is in the
    /// bucket `i`. If no candidate is found, then the vector returned will
    /// have all lanes set to zero.
    ///
    /// `chunk` should correspond to a `V::BYTES` window of the haystack
    /// (where the least significant byte corresponds to the start of the
    /// window). For fat Teddy, the haystack window length should be
    /// `V::BYTES / 2`, with the window repeated in each half of the vector.
    ///
    /// `masks[0]` should correspond to the low/high mask for the first byte
    /// of all patterns that are being searched.
    #[inline(always)]
    unsafe fn members1(chunk: V, masks: [Mask<V>; 1]) -> V {
        let lomask = V::splat(0xF);
        let hlo = chunk.and(lomask);
        let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask);
        let locand = masks[0].lo.shuffle_bytes(hlo);
        let hicand = masks[0].hi.shuffle_bytes(hhi);
        locand.and(hicand)
    }

    /// Return a candidate for Teddy (fat or slim) that is searching for
    /// 2-byte candidates.
    ///
    /// If candidates are returned, each will be a collection of 8-bit
    /// bitsets (one bitset per lane), where the ith bit is set in the jth
    /// lane if and only if the byte occurring at the jth lane in `chunk` is
    /// in the bucket `i`. Each candidate returned corresponds to the first
    /// and second bytes of the patterns being searched. If no candidate is
    /// found, then all of the lanes will be set to zero in at least one of
    /// the vectors returned.
    ///
    /// `chunk` should correspond to a `V::BYTES` window of the haystack
    /// (where the least significant byte corresponds to the start of the
    /// window). For fat Teddy, the haystack window length should be
    /// `V::BYTES / 2`, with the window repeated in each half of the vector.
    ///
    /// The masks should correspond to the masks computed for the first and
    /// second bytes of all patterns that are being searched.
    #[inline(always)]
    unsafe fn members2(chunk: V, masks: [Mask<V>; 2]) -> (V, V) {
        let lomask = V::splat(0xF);
        let hlo = chunk.and(lomask);
        let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask);

        let locand1 = masks[0].lo.shuffle_bytes(hlo);
        let hicand1 = masks[0].hi.shuffle_bytes(hhi);
        let cand1 = locand1.and(hicand1);

        let locand2 = masks[1].lo.shuffle_bytes(hlo);
        let hicand2 = masks[1].hi.shuffle_bytes(hhi);
        let cand2 = locand2.and(hicand2);

        (cand1, cand2)
    }

    /// Return a candidate for Teddy (fat or slim) that is searching for
    /// 3-byte candidates.
/// /// If candidates are returned, each will be a collection of 8-bit bitsets /// (one bitset per lane), where the ith bit is set in the jth lane if and /// only if the byte occurring at the jth lane in `chunk` is in the bucket /// `i`. Each candidate returned corresponds to the first, second and third /// bytes of the patterns being searched. If no candidate is found, then /// all of the lanes will be set to zero in at least one of the vectors /// returned. /// /// `chunk` should correspond to a `V::BYTES` window of the haystack (where /// the least significant byte corresponds to the start of the window). For /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with /// the window repeated in each half of the vector. /// /// The masks should correspond to the masks computed for the first, second /// and third bytes of all patterns that are being searched. #[inline(always)] unsafe fn members3(chunk: V, masks: [Mask; 3]) -> (V, V, V) { let lomask = V::splat(0xF); let hlo = chunk.and(lomask); let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); let locand1 = masks[0].lo.shuffle_bytes(hlo); let hicand1 = masks[0].hi.shuffle_bytes(hhi); let cand1 = locand1.and(hicand1); let locand2 = masks[1].lo.shuffle_bytes(hlo); let hicand2 = masks[1].hi.shuffle_bytes(hhi); let cand2 = locand2.and(hicand2); let locand3 = masks[2].lo.shuffle_bytes(hlo); let hicand3 = masks[2].hi.shuffle_bytes(hhi); let cand3 = locand3.and(hicand3); (cand1, cand2, cand3) } /// Return a candidate for Teddy (fat or slim) that is searching for 4-byte /// candidates. /// /// If candidates are returned, each will be a collection of 8-bit bitsets /// (one bitset per lane), where the ith bit is set in the jth lane if and /// only if the byte occurring at the jth lane in `chunk` is in the bucket /// `i`. Each candidate returned corresponds to the first, second, third /// and fourth bytes of the patterns being searched. If no candidate is /// found, then all of the lanes will be set to zero in at least one of the /// vectors returned. /// /// `chunk` should correspond to a `V::BYTES` window of the haystack (where /// the least significant byte corresponds to the start of the window). For /// fat Teddy, the haystack window length should be `V::BYTES / 2`, with /// the window repeated in each half of the vector. /// /// The masks should correspond to the masks computed for the first, /// second, third and fourth bytes of all patterns that are being searched. #[inline(always)] unsafe fn members4(chunk: V, masks: [Mask; 4]) -> (V, V, V, V) { let lomask = V::splat(0xF); let hlo = chunk.and(lomask); let hhi = chunk.shift_8bit_lane_right::<4>().and(lomask); let locand1 = masks[0].lo.shuffle_bytes(hlo); let hicand1 = masks[0].hi.shuffle_bytes(hhi); let cand1 = locand1.and(hicand1); let locand2 = masks[1].lo.shuffle_bytes(hlo); let hicand2 = masks[1].hi.shuffle_bytes(hhi); let cand2 = locand2.and(hicand2); let locand3 = masks[2].lo.shuffle_bytes(hlo); let hicand3 = masks[2].hi.shuffle_bytes(hhi); let cand3 = locand3.and(hicand3); let locand4 = masks[3].lo.shuffle_bytes(hlo); let hicand4 = masks[3].hi.shuffle_bytes(hhi); let cand4 = locand4.and(hicand4); (cand1, cand2, cand3, cand4) } } /// Represents the low and high nybble masks that will be used during /// search. Each mask is 32 bytes wide, although only the first 16 bytes are /// used for 128-bit vectors. /// /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set /// if and only if the corresponding nybble is in the ith bucket. 
The index of /// the byte (0-15, inclusive) corresponds to the nybble. /// /// Each mask is used as the target of a shuffle, where the indices for the /// shuffle are taken from the haystack. AND'ing the shuffles for both the /// low and high masks together also results in 8-bit bitsets, but where bit /// `i` is set if and only if the correspond *byte* is in the ith bucket. #[derive(Clone, Default)] struct SlimMaskBuilder { lo: [u8; 32], hi: [u8; 32], } impl SlimMaskBuilder { /// Update this mask by adding the given byte to the given bucket. The /// given bucket must be in the range 0-7. /// /// # Panics /// /// When `bucket >= 8`. fn add(&mut self, bucket: usize, byte: u8) { assert!(bucket < 8); let bucket = u8::try_from(bucket).unwrap(); let byte_lo = usize::from(byte & 0xF); let byte_hi = usize::from((byte >> 4) & 0xF); // When using 256-bit vectors, we need to set this bucket assignment in // the low and high 128-bit portions of the mask. This allows us to // process 32 bytes at a time. Namely, AVX2 shuffles operate on each // of the 128-bit lanes, rather than the full 256-bit vector at once. self.lo[byte_lo] |= 1 << bucket; self.lo[byte_lo + 16] |= 1 << bucket; self.hi[byte_hi] |= 1 << bucket; self.hi[byte_hi + 16] |= 1 << bucket; } /// Turn this builder into a vector mask. /// /// # Panics /// /// When `V` represents a vector bigger than what `MaskBytes` can contain. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] unsafe fn build(&self) -> Mask { assert!(V::BYTES <= self.lo.len()); assert!(V::BYTES <= self.hi.len()); Mask { lo: V::load_unaligned(self.lo[..].as_ptr()), hi: V::load_unaligned(self.hi[..].as_ptr()), } } /// A convenience function for building `N` vector masks from a slim /// `Teddy` value. /// /// # Panics /// /// When `V` represents a vector bigger than what `MaskBytes` can contain. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] unsafe fn from_teddy( teddy: &Teddy<8>, ) -> [Mask; BYTES] { // MSRV(1.63): Use core::array::from_fn to just build the array here // instead of creating a vector and turning it into an array. let mut mask_builders = vec![SlimMaskBuilder::default(); BYTES]; for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { for pid in bucket.iter().copied() { let pat = teddy.patterns.get(pid); for (i, builder) in mask_builders.iter_mut().enumerate() { builder.add(bucket_index, pat.bytes()[i]); } } } let array = <[SlimMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); array.map(|builder| builder.build()) } } impl Debug for SlimMaskBuilder { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let (mut parts_lo, mut parts_hi) = (vec![], vec![]); for i in 0..32 { parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); } f.debug_struct("SlimMaskBuilder") .field("lo", &parts_lo) .field("hi", &parts_hi) .finish() } } /// Represents the low and high nybble masks that will be used during "fat" /// Teddy search. /// /// Each mask is 32 bytes wide, and at the time of writing, only 256-bit vectors /// support fat Teddy. /// /// A fat Teddy mask is like a slim Teddy mask, except that instead of /// repeating the bitsets in the high and low 128-bits in 256-bit vectors, the /// high and low 128-bit halves each represent distinct buckets. (Bringing the /// total to 16 instead of 8.) 
This permits spreading the patterns out a bit /// more and thus putting less pressure on verification to be fast. /// /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set /// if and only if the corresponding nybble is in the ith bucket. The index of /// the byte (0-15, inclusive) corresponds to the nybble. #[derive(Clone, Copy, Default)] struct FatMaskBuilder { lo: [u8; 32], hi: [u8; 32], } impl FatMaskBuilder { /// Update this mask by adding the given byte to the given bucket. The /// given bucket must be in the range 0-15. /// /// # Panics /// /// When `bucket >= 16`. fn add(&mut self, bucket: usize, byte: u8) { assert!(bucket < 16); let bucket = u8::try_from(bucket).unwrap(); let byte_lo = usize::from(byte & 0xF); let byte_hi = usize::from((byte >> 4) & 0xF); // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, // the high 128 bits of our mask correspond to buckets 8-15, while the // low 128 bits correspond to buckets 0-7. if bucket < 8 { self.lo[byte_lo] |= 1 << bucket; self.hi[byte_hi] |= 1 << bucket; } else { self.lo[byte_lo + 16] |= 1 << (bucket % 8); self.hi[byte_hi + 16] |= 1 << (bucket % 8); } } /// Turn this builder into a vector mask. /// /// # Panics /// /// When `V` represents a vector bigger than what `MaskBytes` can contain. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] unsafe fn build(&self) -> Mask { assert!(V::BYTES <= self.lo.len()); assert!(V::BYTES <= self.hi.len()); Mask { lo: V::load_unaligned(self.lo[..].as_ptr()), hi: V::load_unaligned(self.hi[..].as_ptr()), } } /// A convenience function for building `N` vector masks from a fat /// `Teddy` value. /// /// # Panics /// /// When `V` represents a vector bigger than what `MaskBytes` can contain. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[inline(always)] unsafe fn from_teddy( teddy: &Teddy<16>, ) -> [Mask; BYTES] { // MSRV(1.63): Use core::array::from_fn to just build the array here // instead of creating a vector and turning it into an array. let mut mask_builders = vec![FatMaskBuilder::default(); BYTES]; for (bucket_index, bucket) in teddy.buckets.iter().enumerate() { for pid in bucket.iter().copied() { let pat = teddy.patterns.get(pid); for (i, builder) in mask_builders.iter_mut().enumerate() { builder.add(bucket_index, pat.bytes()[i]); } } } let array = <[FatMaskBuilder; BYTES]>::try_from(mask_builders).unwrap(); array.map(|builder| builder.build()) } } impl Debug for FatMaskBuilder { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let (mut parts_lo, mut parts_hi) = (vec![], vec![]); for i in 0..32 { parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); } f.debug_struct("FatMaskBuilder") .field("lo", &parts_lo) .field("hi", &parts_hi) .finish() } } aho-corasick-1.1.3/src/packed/teddy/mod.rs000064400000000000000000000005141046102023000164450ustar 00000000000000// Regrettable, but Teddy stuff just isn't used on all targets. And for some // targets, like aarch64, only "slim" Teddy is used and so "fat" Teddy gets a // bunch of dead-code warnings. Just not worth trying to squash them. Blech. 
#![allow(dead_code)]

pub(crate) use self::builder::{Builder, Searcher};

mod builder;
mod generic;
aho-corasick-1.1.3/src/packed/tests.rs000064400000000000000000000447561046102023000157300ustar 00000000000000use std::collections::HashMap;

use alloc::{
    format,
    string::{String, ToString},
    vec,
    vec::Vec,
};

use crate::{
    packed::{Config, MatchKind},
    util::search::Match,
};

/// A description of a single test against a multi-pattern searcher.
///
/// A single test may not necessarily pass on every configuration of a
/// searcher. The tests are categorized and grouped appropriately below.
#[derive(Clone, Debug, Eq, PartialEq)]
struct SearchTest {
    /// The name of this test, for debugging.
    name: &'static str,
    /// The patterns to search for.
    patterns: &'static [&'static str],
    /// The text to search.
    haystack: &'static str,
    /// Each match is a triple of (pattern_index, start, end), where
    /// pattern_index is an index into `patterns` and `start`/`end` are
    /// indices into `haystack`.
    matches: &'static [(usize, usize, usize)],
}

struct SearchTestOwned {
    offset: usize,
    name: String,
    patterns: Vec<String>,
    haystack: String,
    matches: Vec<(usize, usize, usize)>,
}

impl SearchTest {
    fn variations(&self) -> Vec<SearchTestOwned> {
        let count = if cfg!(miri) { 1 } else { 261 };
        let mut tests = vec![];
        for i in 0..count {
            tests.push(self.offset_prefix(i));
            tests.push(self.offset_suffix(i));
            tests.push(self.offset_both(i));
        }
        tests
    }

    fn offset_both(&self, off: usize) -> SearchTestOwned {
        SearchTestOwned {
            offset: off,
            name: self.name.to_string(),
            patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
            haystack: format!(
                "{}{}{}",
                "Z".repeat(off),
                self.haystack,
                "Z".repeat(off)
            ),
            matches: self
                .matches
                .iter()
                .map(|&(id, s, e)| (id, s + off, e + off))
                .collect(),
        }
    }

    fn offset_prefix(&self, off: usize) -> SearchTestOwned {
        SearchTestOwned {
            offset: off,
            name: self.name.to_string(),
            patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
            haystack: format!("{}{}", "Z".repeat(off), self.haystack),
            matches: self
                .matches
                .iter()
                .map(|&(id, s, e)| (id, s + off, e + off))
                .collect(),
        }
    }

    fn offset_suffix(&self, off: usize) -> SearchTestOwned {
        SearchTestOwned {
            offset: off,
            name: self.name.to_string(),
            patterns: self.patterns.iter().map(|s| s.to_string()).collect(),
            haystack: format!("{}{}", self.haystack, "Z".repeat(off)),
            matches: self.matches.to_vec(),
        }
    }
}

/// Short-hand constructor for SearchTest. We use it a lot below.
macro_rules! t {
    ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => {
        SearchTest {
            name: stringify!($name),
            patterns: $patterns,
            haystack: $haystack,
            matches: $matches,
        }
    };
}

/// A collection of test groups.
type TestCollection = &'static [&'static [SearchTest]];

// Define several collections corresponding to the different types of match
// semantics supported. These collections have some overlap, but each
// collection should have some tests that no other collection has.

/// Tests for leftmost-first match semantics.
const PACKED_LEFTMOST_FIRST: TestCollection =
    &[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY];

/// Tests for leftmost-longest match semantics.
const PACKED_LEFTMOST_LONGEST: TestCollection =
    &[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY];

// Now define the individual tests that make up the collections above.

/// A collection of tests that should always be true regardless of match
/// semantics. That is, all combinations of leftmost-{first, longest} should
/// produce the same answer.
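// For example, the entry
// t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6)])
// below says: pattern 0 ("foo") matches haystack[0..3] and pattern 1
// ("bar") matches haystack[3..6].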
const BASICS: &'static [SearchTest] = &[ t!(basic001, &["a"], "", &[]), t!(basic010, &["a"], "a", &[(0, 0, 1)]), t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), t!(basic050, &["a"], "bba", &[(0, 2, 3)]), t!(basic060, &["a"], "bbb", &[]), t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), t!(basic100, &["aa"], "", &[]), t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), t!(basic130, &["aa"], "abbab", &[]), t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]), t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), t!(basic230, &["abcd"], "abcd", &[(0, 0, 4)]), t!(basic240, &["abcd"], "zazabzabcdz", &[(0, 6, 10)]), t!(basic250, &["abcd"], "zazabcdzabcdz", &[(0, 3, 7), (0, 8, 12)]), t!(basic300, &["a", "b"], "", &[]), t!(basic310, &["a", "b"], "z", &[]), t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), t!( basic340, &["a", "b"], "abba", &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] ), t!( basic350, &["b", "a"], "abba", &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] ), t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), t!(basic400, &["foo", "bar"], "", &[]), t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]), t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]), t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), t!( basic720, &["yabcdef", "bcdeyabc", "abcdezghi"], "yabcdezghi", &[(2, 1, 10),] ), t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]), t!( basic840, &["ab", "ba"], "abababa", &[(0, 0, 2), (0, 2, 4), (0, 4, 6),] ), t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]), ]; /// Tests for leftmost match semantics. These should pass for both /// leftmost-first and leftmost-longest match kinds. Stated differently, among /// ambiguous matches, the longest match and the match that appeared first when /// constructing the automaton should always be the same. 
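// For example, searching "abcd" for the patterns ["ab", "abcd"] is ambiguous
// in a way these tests must avoid: leftmost-first reports (0, 0, 2) because
// "ab" was given first, while leftmost-longest reports (1, 0, 4) because
// "abcd" is longer. (See leftfirst000 and leftlong000 below for exactly this
// case.)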
const LEFTMOST: &'static [SearchTest] = &[ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), t!( leftmost360, &["abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(2, 0, 8),] ), t!( leftmost370, &["abcdefghi", "cde", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost380, &["abcdefghi", "hz", "abcdefgh", "a"], "abcdefghz", &[(2, 0, 8),] ), t!( leftmost390, &["b", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost400, &["h", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost410, &["z", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8), (0, 8, 9),] ), ]; /// Tests for non-overlapping leftmost-first match semantics. These tests /// should generally be specific to leftmost-first, which means they should /// generally fail under leftmost-longest semantics. const LEFTMOST_FIRST: &'static [SearchTest] = &[ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), t!( leftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[(1, 1, 2), (3, 2, 4),] ), t!( leftfirst320, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(0, 0, 1), (2, 7, 9),] ), t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), t!( leftfirst340, &["abcdef", "x", "x", "x", "x", "x", "x", "abcde"], "abcdef", &[(0, 0, 6)] ), ]; /// Tests for non-overlapping leftmost-longest match semantics. These tests /// should generally be specific to leftmost-longest, which means they should /// generally fail under leftmost-first semantics. const LEFTMOST_LONGEST: &'static [SearchTest] = &[ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]), t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]), t!( leftlong310, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]), t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]), ]; /// Regression tests that are applied to all combinations. /// /// If regression tests are needed for specific match semantics, then add them /// to the appropriate group above. 
const REGRESSION: &'static [SearchTest] = &[ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), t!( regression030, &["libcore/", "libstd/"], "libcore/char/methods.rs", &[(0, 0, 8),] ), t!( regression040, &["libstd/", "libcore/"], "libcore/char/methods.rs", &[(1, 0, 8),] ), t!( regression050, &["\x00\x00\x01", "\x00\x00\x00"], "\x00\x00\x00", &[(1, 0, 3),] ), t!( regression060, &["\x00\x00\x00", "\x00\x00\x01"], "\x00\x00\x00", &[(0, 0, 3),] ), ]; const TEDDY: &'static [SearchTest] = &[ t!( teddy010, &["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], "abcdefghijk", &[ (0, 0, 1), (1, 1, 2), (2, 2, 3), (3, 3, 4), (4, 4, 5), (5, 5, 6), (6, 6, 7), (7, 7, 8), (8, 8, 9), (9, 9, 10), (10, 10, 11) ] ), t!( teddy020, &["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"], "abcdefghijk", &[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),] ), t!( teddy030, &["abc"], "abcdefghijklmnopqrstuvwxyzabcdefghijk", &[(0, 0, 3), (0, 26, 29)] ), ]; // Now define a test for each combination of things above that we want to run. // Since there are a few different combinations for each collection of tests, // we define a couple of macros to avoid repetition drudgery. The testconfig // macro constructs the automaton from a given match kind, and runs the search // tests one-by-one over the given collection. The `with` parameter allows one // to configure the config with additional parameters. The testcombo macro // invokes testconfig in precisely this way: it sets up several tests where // each one turns a different knob on Config. macro_rules! testconfig { ($name:ident, $collection:expr, $with:expr) => { #[test] fn $name() { run_search_tests($collection, |test| { let mut config = Config::new(); $with(&mut config); let mut builder = config.builder(); builder.extend(test.patterns.iter().map(|p| p.as_bytes())); let searcher = match builder.build() { Some(searcher) => searcher, None => { // For x86-64 and aarch64, not building a searcher is // probably a bug, so be loud. 
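// On other targets, `build()` returning `None` just means no vectorized
// searcher is available for this configuration, so the closure returns
// `None` below and `run_search_tests` skips this collection.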
if cfg!(any( target_arch = "x86_64", target_arch = "aarch64" )) { panic!("failed to build packed searcher") } return None; } }; Some(searcher.find_iter(&test.haystack).collect()) }); } }; }
testconfig!( search_default_leftmost_first, PACKED_LEFTMOST_FIRST, |_: &mut Config| {} );
testconfig!( search_default_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.match_kind(MatchKind::LeftmostLongest); } );
testconfig!( search_teddy_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.only_teddy(true); } );
testconfig!( search_teddy_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.only_teddy(true).match_kind(MatchKind::LeftmostLongest); } );
testconfig!( search_teddy_ssse3_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.only_teddy(true); #[cfg(target_arch = "x86_64")] if std::is_x86_feature_detected!("ssse3") { c.only_teddy_256bit(Some(false)); } } );
testconfig!( search_teddy_ssse3_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.only_teddy(true).match_kind(MatchKind::LeftmostLongest); #[cfg(target_arch = "x86_64")] if std::is_x86_feature_detected!("ssse3") { c.only_teddy_256bit(Some(false)); } } );
testconfig!( search_teddy_avx2_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.only_teddy(true); #[cfg(target_arch = "x86_64")] if std::is_x86_feature_detected!("avx2") { c.only_teddy_256bit(Some(true)); } } );
testconfig!( search_teddy_avx2_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.only_teddy(true).match_kind(MatchKind::LeftmostLongest); #[cfg(target_arch = "x86_64")] if std::is_x86_feature_detected!("avx2") { c.only_teddy_256bit(Some(true)); } } );
testconfig!( search_teddy_fat_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.only_teddy(true); #[cfg(target_arch = "x86_64")] if std::is_x86_feature_detected!("avx2") { c.only_teddy_fat(Some(true)); } } );
testconfig!( search_teddy_fat_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.only_teddy(true).match_kind(MatchKind::LeftmostLongest); #[cfg(target_arch = "x86_64")] if std::is_x86_feature_detected!("avx2") { c.only_teddy_fat(Some(true)); } } );
testconfig!( search_rabinkarp_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.only_rabin_karp(true); } );
testconfig!( search_rabinkarp_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.only_rabin_karp(true).match_kind(MatchKind::LeftmostLongest); } );
#[test] fn search_tests_have_unique_names() { let assert = |constname, tests: &[SearchTest]| { let mut seen = HashMap::new(); // map from test name to position
for (i, test) in tests.iter().enumerate() { if !seen.contains_key(test.name) { seen.insert(test.name, i); } else { let last = seen[test.name]; panic!( "{} tests have duplicate names at positions {} and {}", constname, last, i ); } } }; assert("BASICS", BASICS); assert("LEFTMOST", LEFTMOST); assert("LEFTMOST_FIRST", LEFTMOST_FIRST); assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST); assert("REGRESSION", REGRESSION); assert("TEDDY", TEDDY); }
fn run_search_tests<F: FnMut(&SearchTestOwned) -> Option<Vec<Match>>>( which: TestCollection, mut f: F, ) { let get_match_triples = |matches: Vec<Match>| -> Vec<(usize, usize, usize)> { matches .into_iter() .map(|m| (m.pattern().as_usize(), m.start(), m.end())) .collect() }; for &tests in which { for spec in tests { for test in spec.variations() { let results = match f(&test) { None => continue, Some(results) => results, }; assert_eq!( test.matches, get_match_triples(results).as_slice(), "test: {}, patterns: {:?}, haystack(len={:?}): {:?}, \ offset: {:?}", test.name,
test.patterns, test.haystack.len(), test.haystack, test.offset, ); } } } }
aho-corasick-1.1.3/src/packed/vector.rs000064400000000000000000001634751046102023000160750ustar 00000000000000// NOTE: The descriptions for each of the vector methods on the traits below // are pretty inscrutable. For this reason, there are tests for every method // on every trait impl below. If you're confused about what an op does, // consult its test. (They probably should be doc tests, but I couldn't figure // out how to write them in a non-annoying way.)
use core::{ fmt::Debug, panic::{RefUnwindSafe, UnwindSafe}, };
/// A trait for describing vector operations used by vectorized searchers. /// /// The trait is highly constrained to low level vector operations needed for /// the specific algorithms used in this crate. In general, it was invented /// mostly to be generic over x86's __m128i and __m256i types. At time of /// writing, it also supports wasm and aarch64 128-bit vector types as well. /// /// # Safety /// /// All methods are not safe since they are intended to be implemented using /// vendor intrinsics, which are also not safe. Callers must ensure that /// the appropriate target features are enabled in the calling function, /// and that the current CPU supports them. All implementations should /// avoid marking the routines with `#[target_feature]` and instead mark /// them as `#[inline(always)]` to ensure they get appropriately inlined. /// (`inline(always)` cannot be used with target_feature.)
pub(crate) trait Vector: Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe { /// The number of bits in the vector. const BITS: usize; /// The number of bytes in the vector. That is, this is the size of the /// vector in memory. const BYTES: usize;
/// Create a vector with 8-bit lanes with the given byte repeated into each /// lane. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn splat(byte: u8) -> Self;
/// Read a vector-size number of bytes from the given pointer. The pointer /// does not need to be aligned. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. /// /// Callers must guarantee that at least `BYTES` bytes are readable from /// `data`. unsafe fn load_unaligned(data: *const u8) -> Self;
/// Returns true if and only if this vector has zero in all of its lanes. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn is_zero(self) -> bool;
/// Do an 8-bit pairwise equality check. If lane `i` is equal in this /// vector and the one given, then lane `i` in the resulting vector is set /// to `0xFF`. Otherwise, it is set to `0x00`. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn cmpeq(self, vector2: Self) -> Self;
/// Perform a bitwise 'and' of this vector and the one given and return /// the result. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn and(self, vector2: Self) -> Self;
/// Perform a bitwise 'or' of this vector and the one given and return /// the result. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. #[allow(dead_code)] // unused, but useful enough to keep around?
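// (The `vector_or` tests below still exercise it.)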
unsafe fn or(self, vector2: Self) -> Self;
/// Shift each 8-bit lane in this vector to the right by the number of /// bits indicated by the `BITS` parameter. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self;
/// Shift this vector to the left by one byte and shift the most /// significant byte of `vector2` into the least significant position of /// this vector. /// /// Stated differently, this behaves as if `self` and `vector2` were /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted /// right by `Self::BYTES - 1` bytes. /// /// With respect to the Teddy algorithm, `vector2` is usually a previous /// `Self::BYTES` chunk from the haystack and `self` is the chunk /// immediately following it. This permits combining the last byte /// from the previous chunk (`vector2`) with the first `Self::BYTES - 1` /// bytes from the current chunk. This permits aligning the result of /// various shuffles so that they can be and-ed together and a possible /// candidate discovered. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn shift_in_one_byte(self, vector2: Self) -> Self;
/// Shift this vector to the left by two bytes and shift the two most /// significant bytes of `vector2` into the least significant position of /// this vector. /// /// Stated differently, this behaves as if `self` and `vector2` were /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted /// right by `Self::BYTES - 2` bytes. /// /// With respect to the Teddy algorithm, `vector2` is usually a previous /// `Self::BYTES` chunk from the haystack and `self` is the chunk /// immediately following it. This permits combining the last two bytes /// from the previous chunk (`vector2`) with the first `Self::BYTES - 2` /// bytes from the current chunk. This permits aligning the result of /// various shuffles so that they can be and-ed together and a possible /// candidate discovered. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self;
/// Shift this vector to the left by three bytes and shift the three most /// significant bytes of `vector2` into the least significant position of /// this vector. /// /// Stated differently, this behaves as if `self` and `vector2` were /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted /// right by `Self::BYTES - 3` bytes. /// /// With respect to the Teddy algorithm, `vector2` is usually a previous /// `Self::BYTES` chunk from the haystack and `self` is the chunk /// immediately following it. This permits combining the last three bytes /// from the previous chunk (`vector2`) with the first `Self::BYTES - 3` /// bytes from the current chunk. This permits aligning the result of /// various shuffles so that they can be and-ed together and a possible /// candidate discovered. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self;
/// Shuffles the bytes in this vector according to the indices in each of /// the corresponding lanes in `indices`. /// /// If `i` is the index of corresponding lanes, `A` is this vector, `B` is /// indices and `C` is the resulting vector, then `C = A[B[i]]`.
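/// For example, using 4 lanes for brevity: if `A = [10, 20, 30, 40]` and
/// `B = [3, 3, 0, 2]`, then `shuffle_bytes` produces `C = [40, 40, 10, 30]`.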
/// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn shuffle_bytes(self, indices: Self) -> Self;
/// Call the provided function for each 64-bit lane in this vector. The /// given function is provided the lane index and lane value as a `u64`. /// /// If `f` returns `Some`, then iteration over the lanes is stopped and the /// value is returned. Otherwise, this returns `None`. /// /// # Notes /// /// Conceptually it would be nice if we could have an /// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is /// tricky given Rust's [current support for const generics][support]. /// And even if we could, it would be tricky to write generic code over /// it. (Not impossible. We could introduce another layer that requires /// `AsRef<[u64]>` or something.) /// /// [support]: https://github.com/rust-lang/rust/issues/60551 /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn for_each_64bit_lane<T>( self, f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T>; }
/// This trait extends the `Vector` trait with additional operations to support /// Fat Teddy. /// /// Fat Teddy uses 16 buckets instead of 8, but reads half as many bytes (as /// the vector size) instead of the full size of a vector per iteration. For /// example, when using a 256-bit vector, Slim Teddy reads 32 bytes at a time /// but Fat Teddy reads 16 bytes at a time. /// /// Fat Teddy is useful when searching for a large number of literals. /// The extra number of buckets spreads the literals out more and reduces /// verification time. /// /// Currently we only implement this for AVX on x86_64. It would be nice to /// implement this for SSE on x86_64 and NEON on aarch64, with the latter two /// only reading 8 bytes at a time. It's not clear how well it would work, but /// there are some tricky things to figure out in terms of implementation. The /// `half_shift_in_{one,two,three}_bytes` methods in particular are probably /// the trickiest of the bunch. For AVX2, these are implemented by taking /// advantage of the fact that `_mm256_alignr_epi8` operates on each 128-bit /// half instead of the full 256-bit vector. (Whereas `_mm_alignr_epi8` /// operates on the full 128-bit vector and not on each 64-bit half.) I didn't /// do a careful survey of NEON to see if it could easily support these /// operations.
pub(crate) trait FatVector: Vector { type Half: Vector;
/// Read a half-vector-size number of bytes from the given pointer, and /// broadcast it across both halves of a full vector. The pointer does not /// need to be aligned. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. /// /// Callers must guarantee that at least `Self::Half::BYTES` bytes are /// readable from `data`. unsafe fn load_half_unaligned(data: *const u8) -> Self;
/// Like `Vector::shift_in_one_byte`, except this is done for each half /// of the vector instead. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self;
/// Like `Vector::shift_in_two_bytes`, except this is done for each half /// of the vector instead. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU.
unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self;
/// Like `Vector::shift_in_three_bytes`, except this is done for each half /// of the vector instead. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self;
/// Swap the 128-bit lanes in this vector. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn swap_halves(self) -> Self;
/// Unpack and interleave the 8-bit lanes from the low 128 bits of each /// vector and return the result. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self;
/// Unpack and interleave the 8-bit lanes from the high 128 bits of each /// vector and return the result. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self;
/// Call the provided function for each 64-bit lane in the lower half /// of this vector and then in the other vector. The given function is /// provided the lane index and lane value as a `u64`. (The high 128 bits /// of each vector are ignored.) /// /// If `f` returns `Some`, then iteration over the lanes is stopped and the /// value is returned. Otherwise, this returns `None`. /// /// # Safety /// /// Callers must ensure that this is okay to call in the current target for /// the current CPU. unsafe fn for_each_low_64bit_lane<T>( self, vector2: Self, f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T>; }
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] mod x86_64_ssse3 { use core::arch::x86_64::*; use crate::util::int::{I32, I8}; use super::Vector;
impl Vector for __m128i { const BITS: usize = 128; const BYTES: usize = 16;
#[inline(always)] unsafe fn splat(byte: u8) -> __m128i { _mm_set1_epi8(i8::from_bits(byte)) }
#[inline(always)] unsafe fn load_unaligned(data: *const u8) -> __m128i { _mm_loadu_si128(data.cast::<__m128i>()) }
#[inline(always)] unsafe fn is_zero(self) -> bool { let cmp = self.cmpeq(Self::splat(0)); _mm_movemask_epi8(cmp).to_bits() == 0xFFFF }
#[inline(always)] unsafe fn cmpeq(self, vector2: Self) -> __m128i { _mm_cmpeq_epi8(self, vector2) }
#[inline(always)] unsafe fn and(self, vector2: Self) -> __m128i { _mm_and_si128(self, vector2) }
#[inline(always)] unsafe fn or(self, vector2: Self) -> __m128i { _mm_or_si128(self, vector2) }
#[inline(always)] unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self { // Apparently there is no _mm_srli_epi8, so we emulate it by // shifting 16-bit integers and masking out the high nybble of each // 8-bit lane (since that nybble will contain bits from the low // nybble of the previous lane).
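// For example, with BITS=4 the lane 0b1011_0101 becomes 0b0000_1011:
// the 16-bit shift smears the neighboring lane's bits into each high
// nybble, and the 0xF mask below discards them. (In this crate, Teddy
// only calls this with BITS=4 to extract each byte's high nybble, and a
// 4-bit result always fits in the low nybble.)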
let lomask = Self::splat(0xF); _mm_srli_epi16(self, BITS).and(lomask) }
#[inline(always)] unsafe fn shift_in_one_byte(self, vector2: Self) -> Self { _mm_alignr_epi8(self, vector2, 15) }
#[inline(always)] unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self { _mm_alignr_epi8(self, vector2, 14) }
#[inline(always)] unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self { _mm_alignr_epi8(self, vector2, 13) }
#[inline(always)] unsafe fn shuffle_bytes(self, indices: Self) -> Self { _mm_shuffle_epi8(self, indices) }
#[inline(always)] unsafe fn for_each_64bit_lane<T>( self, mut f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T> { // We could just use _mm_extract_epi64 here, but that requires // SSE 4.1. It isn't necessarily a problem to just require SSE 4.1, // but everything else works with SSSE3 so we stick to that subset.
let lanes: [u64; 2] = core::mem::transmute(self); if let Some(t) = f(0, lanes[0]) { return Some(t); } if let Some(t) = f(1, lanes[1]) { return Some(t); } None } } }
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] mod x86_64_avx2 { use core::arch::x86_64::*; use crate::util::int::{I32, I64, I8}; use super::{FatVector, Vector};
impl Vector for __m256i { const BITS: usize = 256; const BYTES: usize = 32;
#[inline(always)] unsafe fn splat(byte: u8) -> __m256i { _mm256_set1_epi8(i8::from_bits(byte)) }
#[inline(always)] unsafe fn load_unaligned(data: *const u8) -> __m256i { _mm256_loadu_si256(data.cast::<__m256i>()) }
#[inline(always)] unsafe fn is_zero(self) -> bool { let cmp = self.cmpeq(Self::splat(0)); _mm256_movemask_epi8(cmp).to_bits() == 0xFFFFFFFF }
#[inline(always)] unsafe fn cmpeq(self, vector2: Self) -> __m256i { _mm256_cmpeq_epi8(self, vector2) }
#[inline(always)] unsafe fn and(self, vector2: Self) -> __m256i { _mm256_and_si256(self, vector2) }
#[inline(always)] unsafe fn or(self, vector2: Self) -> __m256i { _mm256_or_si256(self, vector2) }
#[inline(always)] unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self { let lomask = Self::splat(0xF); _mm256_srli_epi16(self, BITS).and(lomask) }
#[inline(always)] unsafe fn shift_in_one_byte(self, vector2: Self) -> Self { // Credit goes to jneem for figuring this out: // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 // // TL;DR avx2's PALIGNR instruction is actually just two 128-bit // PALIGNR instructions, which is not what we want, so we need to // do some extra shuffling.
let v = _mm256_permute2x128_si256(vector2, self, 0x21); _mm256_alignr_epi8(self, v, 15) }
#[inline(always)] unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self { // Credit goes to jneem for figuring this out: // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 // // TL;DR avx2's PALIGNR instruction is actually just two 128-bit // PALIGNR instructions, which is not what we want, so we need to // do some extra shuffling.
let v = _mm256_permute2x128_si256(vector2, self, 0x21); _mm256_alignr_epi8(self, v, 14) }
#[inline(always)] unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self { // Credit goes to jneem for figuring this out: // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 // // TL;DR avx2's PALIGNR instruction is actually just two 128-bit // PALIGNR instructions, which is not what we want, so we need to // do some extra shuffling.
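// With control 0x21, the permute builds v = [high 128 bits of `vector2`,
// low 128 bits of `self`], so each 128-bit PALIGNR below sees the chunk
// that logically precedes it.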
let v = _mm256_permute2x128_si256(vector2, self, 0x21); _mm256_alignr_epi8(self, v, 13) }
#[inline(always)] unsafe fn shuffle_bytes(self, indices: Self) -> Self { _mm256_shuffle_epi8(self, indices) }
#[inline(always)] unsafe fn for_each_64bit_lane<T>( self, mut f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T> { // NOTE: At one point in the past, I used a transmute on this to // get a [u64; 4], but it turned out to lead to worse codegen IIRC. // I've tried it more recently, and it looks like that's no longer // the case. But since there's no difference, we stick with the // slightly more complicated but transmute-free version.
let lane = _mm256_extract_epi64(self, 0).to_bits(); if let Some(t) = f(0, lane) { return Some(t); } let lane = _mm256_extract_epi64(self, 1).to_bits(); if let Some(t) = f(1, lane) { return Some(t); } let lane = _mm256_extract_epi64(self, 2).to_bits(); if let Some(t) = f(2, lane) { return Some(t); } let lane = _mm256_extract_epi64(self, 3).to_bits(); if let Some(t) = f(3, lane) { return Some(t); } None } }
impl FatVector for __m256i { type Half = __m128i;
#[inline(always)] unsafe fn load_half_unaligned(data: *const u8) -> Self { let half = Self::Half::load_unaligned(data); _mm256_broadcastsi128_si256(half) }
#[inline(always)] unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self { _mm256_alignr_epi8(self, vector2, 15) }
#[inline(always)] unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self { _mm256_alignr_epi8(self, vector2, 14) }
#[inline(always)] unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self { _mm256_alignr_epi8(self, vector2, 13) }
#[inline(always)] unsafe fn swap_halves(self) -> Self { _mm256_permute4x64_epi64(self, 0x4E) }
#[inline(always)] unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self { _mm256_unpacklo_epi8(self, vector2) }
#[inline(always)] unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self { _mm256_unpackhi_epi8(self, vector2) }
#[inline(always)] unsafe fn for_each_low_64bit_lane<T>( self, vector2: Self, mut f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T> { let lane = _mm256_extract_epi64(self, 0).to_bits(); if let Some(t) = f(0, lane) { return Some(t); } let lane = _mm256_extract_epi64(self, 1).to_bits(); if let Some(t) = f(1, lane) { return Some(t); } let lane = _mm256_extract_epi64(vector2, 0).to_bits(); if let Some(t) = f(2, lane) { return Some(t); } let lane = _mm256_extract_epi64(vector2, 1).to_bits(); if let Some(t) = f(3, lane) { return Some(t); } None } } }
#[cfg(all( target_arch = "aarch64", target_feature = "neon", target_endian = "little" ))] mod aarch64_neon { use core::arch::aarch64::*; use super::Vector;
impl Vector for uint8x16_t { const BITS: usize = 128; const BYTES: usize = 16;
#[inline(always)] unsafe fn splat(byte: u8) -> uint8x16_t { vdupq_n_u8(byte) }
#[inline(always)] unsafe fn load_unaligned(data: *const u8) -> uint8x16_t { vld1q_u8(data) }
#[inline(always)] unsafe fn is_zero(self) -> bool { // Could also use vmaxvq_u8. // ... I tried that and couldn't observe any meaningful difference // in benchmarks.
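// vpmaxq_u8 folds the vector with pairwise maxima, so the low 64 bits
// of the result summarize all 16 lanes: that u64 is zero if and only
// if every input lane is zero.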
let maxes = vreinterpretq_u64_u8(vpmaxq_u8(self, self)); vgetq_lane_u64(maxes, 0) == 0 }
#[inline(always)] unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t { vceqq_u8(self, vector2) }
#[inline(always)] unsafe fn and(self, vector2: Self) -> uint8x16_t { vandq_u8(self, vector2) }
#[inline(always)] unsafe fn or(self, vector2: Self) -> uint8x16_t { vorrq_u8(self, vector2) }
#[inline(always)] unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self { debug_assert!(BITS <= 7); vshrq_n_u8(self, BITS) }
#[inline(always)] unsafe fn shift_in_one_byte(self, vector2: Self) -> Self { vextq_u8(vector2, self, 15) }
#[inline(always)] unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self { vextq_u8(vector2, self, 14) }
#[inline(always)] unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self { vextq_u8(vector2, self, 13) }
#[inline(always)] unsafe fn shuffle_bytes(self, indices: Self) -> Self { vqtbl1q_u8(self, indices) }
#[inline(always)] unsafe fn for_each_64bit_lane<T>( self, mut f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T> { let this = vreinterpretq_u64_u8(self); let lane = vgetq_lane_u64(this, 0); if let Some(t) = f(0, lane) { return Some(t); } let lane = vgetq_lane_u64(this, 1); if let Some(t) = f(1, lane) { return Some(t); } None } } }
#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))] mod tests_x86_64_ssse3 { use core::arch::x86_64::*; use crate::util::int::{I32, U32}; use super::*;
fn is_runnable() -> bool { std::is_x86_feature_detected!("ssse3") }
#[target_feature(enable = "ssse3")] unsafe fn load(lanes: [u8; 16]) -> __m128i { __m128i::load_unaligned(&lanes as *const u8) }
#[target_feature(enable = "ssse3")] unsafe fn unload(v: __m128i) -> [u8; 16] { [ _mm_extract_epi8(v, 0).to_bits().low_u8(), _mm_extract_epi8(v, 1).to_bits().low_u8(), _mm_extract_epi8(v, 2).to_bits().low_u8(), _mm_extract_epi8(v, 3).to_bits().low_u8(), _mm_extract_epi8(v, 4).to_bits().low_u8(), _mm_extract_epi8(v, 5).to_bits().low_u8(), _mm_extract_epi8(v, 6).to_bits().low_u8(), _mm_extract_epi8(v, 7).to_bits().low_u8(), _mm_extract_epi8(v, 8).to_bits().low_u8(), _mm_extract_epi8(v, 9).to_bits().low_u8(), _mm_extract_epi8(v, 10).to_bits().low_u8(), _mm_extract_epi8(v, 11).to_bits().low_u8(), _mm_extract_epi8(v, 12).to_bits().low_u8(), _mm_extract_epi8(v, 13).to_bits().low_u8(), _mm_extract_epi8(v, 14).to_bits().low_u8(), _mm_extract_epi8(v, 15).to_bits().low_u8(), ] }
#[test] fn vector_splat() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v = __m128i::splat(0xAF); assert_eq!( unload(v), [ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF ] ); } if !is_runnable() { return; } unsafe { test() } }
#[test] fn vector_is_zero() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert!(!v.is_zero()); let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert!(v.is_zero()); } if !is_runnable() { return; } unsafe { test() } }
#[test] fn vector_cmpeq() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]); let v2 = load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]); assert_eq!( unload(v1.cmpeq(v2)), [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF] ); } if !is_runnable() { return; } unsafe { test() } }
#[test] fn vector_and() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v1 = load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let v2 = load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0]); assert_eq!( unload(v1.and(v2)), [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_or() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v1 = load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let v2 = load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert_eq!( unload(v1.or(v2)), [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shift_8bit_lane_right() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v = load([ 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert_eq!( unload(v.shift_8bit_lane_right::<2>()), [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shift_in_one_byte() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let v2 = load([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v1.shift_in_one_byte(v2)), [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shift_in_two_bytes() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let v2 = load([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v1.shift_in_two_bytes(v2)), [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shift_in_three_bytes() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let v2 = load([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v1.shift_in_three_bytes(v2)), [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shuffle_bytes() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let v2 = load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]); assert_eq!( unload(v1.shuffle_bytes(v2)), [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_for_each_64bit_lane() { #[target_feature(enable = "ssse3")] unsafe fn test() { let v = load([ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, ]); let mut lanes = [0u64; 2]; v.for_each_64bit_lane(|i, lane| { lanes[i] = lane; None::<()> }); assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],); } if !is_runnable() { return; } unsafe { test() } } } #[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))] mod tests_x86_64_avx2 { use core::arch::x86_64::*; use crate::util::int::{I32, U32}; use super::*; fn is_runnable() -> bool { std::is_x86_feature_detected!("avx2") } #[target_feature(enable = "avx2")] unsafe fn load(lanes: [u8; 32]) -> __m256i { __m256i::load_unaligned(&lanes as *const u8) } #[target_feature(enable = "avx2")] unsafe fn load_half(lanes: [u8; 16]) -> __m256i { __m256i::load_half_unaligned(&lanes as *const u8) } #[target_feature(enable = "avx2")] unsafe fn unload(v: __m256i) -> [u8; 32] { [ _mm256_extract_epi8(v, 0).to_bits().low_u8(), _mm256_extract_epi8(v, 
1).to_bits().low_u8(), _mm256_extract_epi8(v, 2).to_bits().low_u8(), _mm256_extract_epi8(v, 3).to_bits().low_u8(), _mm256_extract_epi8(v, 4).to_bits().low_u8(), _mm256_extract_epi8(v, 5).to_bits().low_u8(), _mm256_extract_epi8(v, 6).to_bits().low_u8(), _mm256_extract_epi8(v, 7).to_bits().low_u8(), _mm256_extract_epi8(v, 8).to_bits().low_u8(), _mm256_extract_epi8(v, 9).to_bits().low_u8(), _mm256_extract_epi8(v, 10).to_bits().low_u8(), _mm256_extract_epi8(v, 11).to_bits().low_u8(), _mm256_extract_epi8(v, 12).to_bits().low_u8(), _mm256_extract_epi8(v, 13).to_bits().low_u8(), _mm256_extract_epi8(v, 14).to_bits().low_u8(), _mm256_extract_epi8(v, 15).to_bits().low_u8(), _mm256_extract_epi8(v, 16).to_bits().low_u8(), _mm256_extract_epi8(v, 17).to_bits().low_u8(), _mm256_extract_epi8(v, 18).to_bits().low_u8(), _mm256_extract_epi8(v, 19).to_bits().low_u8(), _mm256_extract_epi8(v, 20).to_bits().low_u8(), _mm256_extract_epi8(v, 21).to_bits().low_u8(), _mm256_extract_epi8(v, 22).to_bits().low_u8(), _mm256_extract_epi8(v, 23).to_bits().low_u8(), _mm256_extract_epi8(v, 24).to_bits().low_u8(), _mm256_extract_epi8(v, 25).to_bits().low_u8(), _mm256_extract_epi8(v, 26).to_bits().low_u8(), _mm256_extract_epi8(v, 27).to_bits().low_u8(), _mm256_extract_epi8(v, 28).to_bits().low_u8(), _mm256_extract_epi8(v, 29).to_bits().low_u8(), _mm256_extract_epi8(v, 30).to_bits().low_u8(), _mm256_extract_epi8(v, 31).to_bits().low_u8(), ] } #[test] fn vector_splat() { #[target_feature(enable = "avx2")] unsafe fn test() { let v = __m256i::splat(0xAF); assert_eq!( unload(v), [ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, ] ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_is_zero() { #[target_feature(enable = "avx2")] unsafe fn test() { let v = load([ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert!(!v.is_zero()); let v = load([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert!(v.is_zero()); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_cmpeq() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, ]); let v2 = load([ 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, ]); assert_eq!( unload(v1.cmpeq(v2)), [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF ] ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_and() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); let v2 = load([ 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert_eq!( unload(v1.and(v2)), [ 0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ] ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_or() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); let v2 = load([ 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert_eq!( unload(v1.or(v2)), [ 0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ] ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shift_8bit_lane_right() { #[target_feature(enable = "avx2")] unsafe fn test() { let v = load([ 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert_eq!( unload(v.shift_8bit_lane_right::<2>()), [ 0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ] ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shift_in_one_byte() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); let v2 = load([ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, ]); assert_eq!( unload(v1.shift_in_one_byte(v2)), [ 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shift_in_two_bytes() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); let v2 = load([ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, ]); assert_eq!( unload(v1.shift_in_two_bytes(v2)), [ 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shift_in_three_bytes() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); let v2 = load([ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, ]); assert_eq!( unload(v1.shift_in_three_bytes(v2)), [ 62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_shuffle_bytes() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); let v2 = load([ 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28, ]); assert_eq!( unload(v1.shuffle_bytes(v2)), [ 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13, 17, 17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29, 29 ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn vector_for_each_64bit_lane() { #[target_feature(enable = "avx2")] unsafe fn test() { let v = load([ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, ]); let mut lanes = [0u64; 4]; v.for_each_64bit_lane(|i, lane| { lanes[i] = lane; None::<()> }); assert_eq!( lanes, [ 0x0807060504030201, 
0x100F0E0D0C0B0A09, 0x1817161514131211, 0x201F1E1D1C1B1A19 ] ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn fat_vector_half_shift_in_one_byte() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load_half([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ]); let v2 = load_half([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v1.half_shift_in_one_byte(v2)), [ 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn fat_vector_half_shift_in_two_bytes() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load_half([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ]); let v2 = load_half([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v1.half_shift_in_two_bytes(v2)), [ 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn fat_vector_half_shift_in_three_bytes() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load_half([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ]); let v2 = load_half([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v1.half_shift_in_three_bytes(v2)), [ 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn fat_vector_swap_halves() { #[target_feature(enable = "avx2")] unsafe fn test() { let v = load([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v.swap_halves()), [ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn fat_vector_interleave_low_8bit_lanes() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); let v2 = load([ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, ]); assert_eq!( unload(v1.interleave_low_8bit_lanes(v2)), [ 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn fat_vector_interleave_high_8bit_lanes() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); let v2 = load([ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, ]); assert_eq!( unload(v1.interleave_high_8bit_lanes(v2)), [ 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 32, 64, ], ); } if !is_runnable() { return; } unsafe { test() } } #[test] fn fat_vector_for_each_low_64bit_lane() { #[target_feature(enable = "avx2")] unsafe fn test() { let v1 = load([ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 
0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, ]); let v2 = load([ 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F, 0x40, ]); let mut lanes = [0u64; 4]; v1.for_each_low_64bit_lane(v2, |i, lane| { lanes[i] = lane; None::<()> }); assert_eq!( lanes, [ 0x0807060504030201, 0x100F0E0D0C0B0A09, 0x2827262524232221, 0x302F2E2D2C2B2A29 ] ); } if !is_runnable() { return; } unsafe { test() } } }
#[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))] mod tests_aarch64_neon { use core::arch::aarch64::*; use super::*;
#[target_feature(enable = "neon")] unsafe fn load(lanes: [u8; 16]) -> uint8x16_t { uint8x16_t::load_unaligned(&lanes as *const u8) }
#[target_feature(enable = "neon")] unsafe fn unload(v: uint8x16_t) -> [u8; 16] { [ vgetq_lane_u8(v, 0), vgetq_lane_u8(v, 1), vgetq_lane_u8(v, 2), vgetq_lane_u8(v, 3), vgetq_lane_u8(v, 4), vgetq_lane_u8(v, 5), vgetq_lane_u8(v, 6), vgetq_lane_u8(v, 7), vgetq_lane_u8(v, 8), vgetq_lane_u8(v, 9), vgetq_lane_u8(v, 10), vgetq_lane_u8(v, 11), vgetq_lane_u8(v, 12), vgetq_lane_u8(v, 13), vgetq_lane_u8(v, 14), vgetq_lane_u8(v, 15), ] }
// Example functions. These don't test the Vector traits, but rather, // specific NEON instructions. They are basically little experiments I // wrote to figure out what an instruction does since their descriptions // are so dense. I decided to keep the experiments around as example tests // in case they're useful.
#[test] fn example_vmaxvq_u8_non_zero() { #[target_feature(enable = "neon")] unsafe fn example() { let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert_eq!(vmaxvq_u8(v), 1); } unsafe { example() } }
#[test] fn example_vmaxvq_u8_zero() { #[target_feature(enable = "neon")] unsafe fn example() { let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert_eq!(vmaxvq_u8(v), 0); } unsafe { example() } }
#[test] fn example_vpmaxq_u8_non_zero() { #[target_feature(enable = "neon")] unsafe fn example() { let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let r = vpmaxq_u8(v, v); assert_eq!( unload(r), [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0] ); } unsafe { example() } }
#[test] fn example_vpmaxq_u8_self() { #[target_feature(enable = "neon")] unsafe fn example() { let v = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let r = vpmaxq_u8(v, v); assert_eq!( unload(r), [2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16] ); } unsafe { example() } }
#[test] fn example_vpmaxq_u8_other() { #[target_feature(enable = "neon")] unsafe fn example() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let v2 = load([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); let r = vpmaxq_u8(v1, v2); assert_eq!( unload(r), [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32] ); } unsafe { example() } }
// Now we test the actual methods on the Vector trait.
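// These mirror the ssse3/avx2 suites above, so every `Vector` impl is
// checked against identical expected outputs.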
#[test] fn vector_splat() { #[target_feature(enable = "neon")] unsafe fn test() { let v = uint8x16_t::splat(0xAF); assert_eq!( unload(v), [ 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF ] ); } unsafe { test() } } #[test] fn vector_is_zero() { #[target_feature(enable = "neon")] unsafe fn test() { let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert!(!v.is_zero()); let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert!(v.is_zero()); } unsafe { test() } } #[test] fn vector_cmpeq() { #[target_feature(enable = "neon")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]); let v2 = load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]); assert_eq!( unload(v1.cmpeq(v2)), [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF] ); } unsafe { test() } } #[test] fn vector_and() { #[target_feature(enable = "neon")] unsafe fn test() { let v1 = load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let v2 = load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert_eq!( unload(v1.and(v2)), [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ); } unsafe { test() } } #[test] fn vector_or() { #[target_feature(enable = "neon")] unsafe fn test() { let v1 = load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let v2 = load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert_eq!( unload(v1.or(v2)), [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ); } unsafe { test() } } #[test] fn vector_shift_8bit_lane_right() { #[target_feature(enable = "neon")] unsafe fn test() { let v = load([ 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert_eq!( unload(v.shift_8bit_lane_right::<2>()), [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] ); } unsafe { test() } } #[test] fn vector_shift_in_one_byte() { #[target_feature(enable = "neon")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let v2 = load([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v1.shift_in_one_byte(v2)), [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], ); } unsafe { test() } } #[test] fn vector_shift_in_two_bytes() { #[target_feature(enable = "neon")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let v2 = load([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v1.shift_in_two_bytes(v2)), [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], ); } unsafe { test() } } #[test] fn vector_shift_in_three_bytes() { #[target_feature(enable = "neon")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let v2 = load([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); assert_eq!( unload(v1.shift_in_three_bytes(v2)), [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], ); } unsafe { test() } } #[test] fn vector_shuffle_bytes() { #[target_feature(enable = "neon")] unsafe fn test() { let v1 = load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let v2 = load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]); assert_eq!( unload(v1.shuffle_bytes(v2)), [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13], ); } unsafe { test() } } #[test] fn vector_for_each_64bit_lane() { #[target_feature(enable = "neon")] unsafe fn test() { let v = load([ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, ]); 
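// NEON loads here are little-endian (the impl is only compiled for
// little-endian targets), so byte 0x01 ends up as the least significant
// byte of lane 0, giving 0x0807060504030201.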
let mut lanes = [0u64; 2]; v.for_each_64bit_lane(|i, lane| { lanes[i] = lane; None::<()> }); assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],); } unsafe { test() } } }
aho-corasick-1.1.3/src/tests.rs000064400000000000000000001571501046102023000145010ustar 00000000000000use std::{collections::HashMap, format, string::String, vec::Vec};
use crate::{ AhoCorasick, AhoCorasickBuilder, AhoCorasickKind, Anchored, Input, Match, MatchKind, StartKind, };
/// A description of a single test against an Aho-Corasick automaton. /// /// A single test may not necessarily pass on every configuration of an /// Aho-Corasick automaton. The tests are categorized and grouped appropriately /// below.
#[derive(Clone, Debug, Eq, PartialEq)] struct SearchTest { /// The name of this test, for debugging. name: &'static str, /// The patterns to search for. patterns: &'static [&'static str], /// The text to search. haystack: &'static str, /// Each match is a triple of (pattern_index, start, end), where /// pattern_index is an index into `patterns` and `start`/`end` are indices /// into `haystack`. matches: &'static [(usize, usize, usize)], }
/// Short-hand constructor for SearchTest. We use it a lot below.
macro_rules! t { ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => { SearchTest { name: stringify!($name), patterns: $patterns, haystack: $haystack, matches: $matches, } }; }
/// A collection of test groups.
type TestCollection = &'static [&'static [SearchTest]];
// Define several collections corresponding to the different types of match // semantics supported by Aho-Corasick. These collections have some overlap, // but each collection should have some tests that no other collection has.
/// Tests for Aho-Corasick's standard non-overlapping match semantics.
const AC_STANDARD_NON_OVERLAPPING: TestCollection = &[BASICS, NON_OVERLAPPING, STANDARD, REGRESSION];
/// Tests for Aho-Corasick's anchored standard non-overlapping match semantics.
const AC_STANDARD_ANCHORED_NON_OVERLAPPING: TestCollection = &[ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, STANDARD_ANCHORED];
/// Tests for Aho-Corasick's standard overlapping match semantics.
const AC_STANDARD_OVERLAPPING: TestCollection = &[BASICS, OVERLAPPING, REGRESSION];
/* Iterators of anchored overlapping searches were removed from the API after 0.7, but we leave the tests commented out for posterity.
/// Tests for Aho-Corasick's anchored standard overlapping match semantics.
const AC_STANDARD_ANCHORED_OVERLAPPING: TestCollection = &[ANCHORED_BASICS, ANCHORED_OVERLAPPING];
*/
/// Tests for Aho-Corasick's leftmost-first match semantics.
const AC_LEFTMOST_FIRST: TestCollection = &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_FIRST, REGRESSION];
/// Tests for Aho-Corasick's anchored leftmost-first match semantics.
const AC_LEFTMOST_FIRST_ANCHORED: TestCollection = &[ ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, ANCHORED_LEFTMOST, ANCHORED_LEFTMOST_FIRST, ];
/// Tests for Aho-Corasick's leftmost-longest match semantics.
const AC_LEFTMOST_LONGEST: TestCollection = &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_LONGEST, REGRESSION];
/// Tests for Aho-Corasick's anchored leftmost-longest match semantics.
const AC_LEFTMOST_LONGEST_ANCHORED: TestCollection = &[ ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, ANCHORED_LEFTMOST, ANCHORED_LEFTMOST_LONGEST, ];
// Now define the individual tests that make up the collections above.
/// A collection of tests for the Aho-Corasick algorithm that should always be /// true regardless of match semantics.
That is, all combinations of /// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} /// should produce the same answer. const BASICS: &'static [SearchTest] = &[ t!(basic000, &[], "", &[]), t!(basic001, &[""], "a", &[(0, 0, 0), (0, 1, 1)]), t!(basic002, &["a"], "", &[]), t!(basic010, &["a"], "a", &[(0, 0, 1)]), t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), t!(basic050, &["a"], "bba", &[(0, 2, 3)]), t!(basic060, &["a"], "bbb", &[]), t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), t!(basic100, &["aa"], "", &[]), t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), t!(basic130, &["aa"], "abbab", &[]), t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), t!(basic300, &["a", "b"], "", &[]), t!(basic310, &["a", "b"], "z", &[]), t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), t!( basic340, &["a", "b"], "abba", &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] ), t!( basic350, &["b", "a"], "abba", &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] ), t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), t!(basic400, &["foo", "bar"], "", &[]), t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]), t!(basic600, &[""], "", &[(0, 0, 0)]), t!(basic610, &[""], "a", &[(0, 0, 0), (0, 1, 1)]), t!(basic620, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]), t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]), t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), t!( basic720, &["yabcdef", "bcdeyabc", "abcdezghi"], "yabcdezghi", &[(2, 1, 10),] ), ]; /// A collection of *anchored* tests for the Aho-Corasick algorithm that should /// always be true regardless of match semantics. That is, all combinations of /// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} should /// produce the same answer. const ANCHORED_BASICS: &'static [SearchTest] = &[ t!(abasic000, &[], "", &[]), t!(abasic001, &[], "a", &[]), t!(abasic002, &[], "abc", &[]), t!(abasic010, &[""], "", &[(0, 0, 0)]), t!(abasic020, &[""], "a", &[(0, 0, 0), (0, 1, 1)]), t!(abasic030, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]), t!(abasic100, &["a"], "a", &[(0, 0, 1)]), t!(abasic110, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(abasic120, &["a", "b"], "ab", &[(0, 0, 1), (1, 1, 2)]), t!(abasic130, &["a", "b"], "ba", &[(1, 0, 1), (0, 1, 2)]), t!(abasic140, &["foo", "foofoo"], "foo", &[(0, 0, 3)]), t!(abasic150, &["foofoo", "foo"], "foo", &[(1, 0, 3)]), t!(abasic200, &["foo"], "foofoo foo", &[(0, 0, 3), (0, 3, 6)]), ]; /// Tests for non-overlapping standard match semantics. /// /// These tests generally shouldn't pass for leftmost-{first,longest}, although /// some do in order to write clearer tests. 
For example, standard000 will /// pass with leftmost-first semantics, but standard010 will not. We write /// both to emphasize how the match semantics work. const STANDARD: &'static [SearchTest] = &[ t!(standard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(standard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), t!(standard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), t!(standard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), t!(standard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]), t!( standard400, &["abcd", "bcd", "cd", "b"], "abcd", &[(3, 1, 2), (2, 2, 4),] ), t!(standard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), t!(standard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2),]), t!(standard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), t!(standard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1),]), t!(standard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), ]; /// Like STANDARD, but for anchored searches. const STANDARD_ANCHORED: &'static [SearchTest] = &[ t!(astandard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(astandard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), t!(astandard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), t!(astandard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), t!(astandard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]), t!(astandard050, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), t!(astandard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), t!(astandard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2)]), t!(astandard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1)]), t!(astandard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1)]), t!(astandard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), ]; /// Tests for non-overlapping leftmost match semantics. These should pass for /// both leftmost-first and leftmost-longest match kinds. Stated differently, /// among ambiguous matches, the longest match and the match that appeared /// first when constructing the automaton should always be the same. const LEFTMOST: &'static [SearchTest] = &[ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(leftmost010, &["a", ""], "a", &[(0, 0, 1)]), t!(leftmost011, &["a", ""], "ab", &[(0, 0, 1), (1, 2, 2)]), t!(leftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]), t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), t!( leftmost360, &["abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(2, 0, 8),] ), t!( leftmost370, &["abcdefghi", "cde", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost380, &["abcdefghi", "hz", "abcdefgh", "a"], "abcdefghz", &[(2, 0, 8),] ), t!( leftmost390, &["b", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost400, &["h", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost410, &["z", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8), (0, 8, 9),] ), ]; /// Like LEFTMOST, but for anchored searches. 
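/// (Anchored searches only report matches that begin exactly where the
/// search starts, which is why e.g. `aleftmost032` below expects no matches
/// at all.)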
const ANCHORED_LEFTMOST: &'static [SearchTest] = &[ t!(aleftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), // We shouldn't allow an empty match immediately following a match, right? t!(aleftmost010, &["a", ""], "a", &[(0, 0, 1)]), t!(aleftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]), t!(aleftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(aleftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), t!(aleftmost032, &["ab", "a"], "xayabbbz", &[]), t!(aleftmost300, &["abcd", "bce", "b"], "abce", &[]), t!(aleftmost301, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), t!(aleftmost310, &["abcd", "ce", "bc"], "abce", &[]), t!(aleftmost320, &["abcd", "bce", "ce", "b"], "abce", &[]), t!(aleftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[]), t!(aleftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), t!(aleftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), t!( aleftmost360, &["abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(2, 0, 8),] ), t!( aleftmost370, &["abcdefghi", "cde", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( aleftmost380, &["abcdefghi", "hz", "abcdefgh", "a"], "abcdefghz", &[(2, 0, 8),] ), t!( aleftmost390, &["b", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( aleftmost400, &["h", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( aleftmost410, &["z", "abcdefghi", "hz", "abcdefgh"], "abcdefghzyz", &[(3, 0, 8), (0, 8, 9)] ), ]; /// Tests for non-overlapping leftmost-first match semantics. These tests /// should generally be specific to leftmost-first, which means they should /// generally fail under leftmost-longest semantics. const LEFTMOST_FIRST: &'static [SearchTest] = &[ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(leftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), t!(leftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), t!(leftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]), t!(leftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), t!(leftfirst014, &["a", ""], "a", &[(0, 0, 1)]), t!(leftfirst015, &["a", ""], "ab", &[(0, 0, 1), (1, 2, 2)]), t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), t!( leftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[(1, 1, 2), (3, 2, 4),] ), t!( leftfirst320, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(0, 0, 1), (2, 7, 9),] ), t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), t!(leftfirst400, &["amwix", "samwise", "sam"], "Zsamwix", &[(2, 1, 4)]), ]; /// Like LEFTMOST_FIRST, but for anchored searches. 
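///
/// An added sketch (not original test data) of the leftmost-first tie-break,
/// using the crate's public API:
///
/// ```
/// use aho_corasick::{AhoCorasick, MatchKind};
///
/// // Among matches starting at the same position, leftmost-first prefers
/// // the pattern that appears earlier in the list, even if a later pattern
/// // would match more bytes. Compare `leftfirst000` above.
/// let ac = AhoCorasick::builder()
///     .match_kind(MatchKind::LeftmostFirst)
///     .build(&["ab", "abcd"])
///     .unwrap();
/// let m = ac.find("abcd").unwrap();
/// assert_eq!((0, 0, 2), (m.pattern().as_usize(), m.start(), m.end()));
/// ```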
const ANCHORED_LEFTMOST_FIRST: &'static [SearchTest] = &[ t!(aleftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(aleftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), t!(aleftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1)]), t!(aleftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]), t!(aleftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), t!(aleftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), t!(aleftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(aleftfirst040, &["a", "ab"], "xayabbbz", &[]), t!(aleftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]), t!(aleftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]), t!(aleftfirst300, &["abcd", "b", "bce"], "abce", &[]), t!(aleftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[]), t!( aleftfirst320, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(0, 0, 1)] ), t!(aleftfirst330, &["a", "abab"], "abab", &[(0, 0, 1)]), t!(aleftfirst400, &["wise", "samwise", "sam"], "samwix", &[(2, 0, 3)]), ]; /// Tests for non-overlapping leftmost-longest match semantics. These tests /// should generally be specific to leftmost-longest, which means they should /// generally fail under leftmost-first semantics. const LEFTMOST_LONGEST: &'static [SearchTest] = &[ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), t!(leftlong020, &["", "a"], "a", &[(1, 0, 1)]), t!(leftlong021, &["", "a", ""], "a", &[(1, 0, 1)]), t!(leftlong022, &["a", "", ""], "a", &[(0, 0, 1)]), t!(leftlong023, &["", "", "a"], "a", &[(2, 0, 1)]), t!(leftlong024, &["", "a"], "ab", &[(1, 0, 1), (0, 2, 2)]), t!(leftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]), t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]), t!( leftlong310, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]), t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]), ]; /// Like LEFTMOST_LONGEST, but for anchored searches. 
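///
/// An added sketch (not original test data) of the leftmost-longest
/// tie-break, using the crate's public API:
///
/// ```
/// use aho_corasick::{AhoCorasick, MatchKind};
///
/// // Among matches starting at the same position, leftmost-longest prefers
/// // the longest pattern regardless of the order given. Compare
/// // `leftlong000` above.
/// let ac = AhoCorasick::builder()
///     .match_kind(MatchKind::LeftmostLongest)
///     .build(&["ab", "abcd"])
///     .unwrap();
/// let m = ac.find("abcd").unwrap();
/// assert_eq!((1, 0, 4), (m.pattern().as_usize(), m.start(), m.end()));
/// ```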
const ANCHORED_LEFTMOST_LONGEST: &'static [SearchTest] = &[ t!(aleftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), t!(aleftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), t!(aleftlong020, &["", "a"], "a", &[(1, 0, 1)]), t!(aleftlong021, &["", "a", ""], "a", &[(1, 0, 1)]), t!(aleftlong022, &["a", "", ""], "a", &[(0, 0, 1)]), t!(aleftlong023, &["", "", "a"], "a", &[(2, 0, 1)]), t!(aleftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), t!(aleftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), t!(aleftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), t!(aleftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), t!(aleftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), t!(aleftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]), t!(aleftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]), t!(aleftlong300, &["abcd", "b", "bce"], "abce", &[]), t!( aleftlong310, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!(aleftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), t!(aleftlong330, &["abcd", "b", "ce"], "abce", &[]), t!(aleftlong340, &["a", "ab"], "xayabbbz", &[]), ]; /// Tests for non-overlapping match semantics. /// /// Generally these tests shouldn't pass when using overlapping semantics. /// These should pass for both standard and leftmost match semantics. const NON_OVERLAPPING: &'static [SearchTest] = &[ t!(nover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), t!(nover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), t!(nover030, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]), t!( nover100, &["ab", "ba"], "abababa", &[(0, 0, 2), (0, 2, 4), (0, 4, 6),] ), t!(nover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]), t!(nover300, &["", ""], "", &[(0, 0, 0),]), t!(nover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1),]), ]; /// Like NON_OVERLAPPING, but for anchored searches. const ANCHORED_NON_OVERLAPPING: &'static [SearchTest] = &[ t!(anover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), t!(anover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), t!(anover030, &["abc", "bc"], "zazabcz", &[]), t!( anover100, &["ab", "ba"], "abababa", &[(0, 0, 2), (0, 2, 4), (0, 4, 6)] ), t!(anover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3)]), t!(anover300, &["", ""], "", &[(0, 0, 0)]), t!(anover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]), ]; /// Tests for overlapping match semantics. /// /// This only supports standard match semantics, since leftmost-{first,longest} /// do not support overlapping matches. 
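///
/// An added sketch (not original test data) of overlapping iteration with
/// the default `Standard` semantics, using the crate's public API:
///
/// ```
/// use aho_corasick::AhoCorasick;
///
/// let ac = AhoCorasick::new(&["abcd", "bcd", "cd"]).unwrap();
/// let matches: Vec<(usize, usize, usize)> = ac
///     .find_overlapping_iter("abcd")
///     .map(|m| (m.pattern().as_usize(), m.start(), m.end()))
///     .collect();
/// // Every match is reported, including matches contained inside other
/// // matches, mirroring the `over020` entry below.
/// assert_eq!(vec![(0, 0, 4), (1, 1, 4), (2, 2, 4)], matches);
/// ```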
const OVERLAPPING: &'static [SearchTest] = &[
    t!(
        over000,
        &["abcd", "bcd", "cd", "b"],
        "abcd",
        &[(3, 1, 2), (0, 0, 4), (1, 1, 4), (2, 2, 4),]
    ),
    t!(
        over010,
        &["bcd", "cd", "b", "abcd"],
        "abcd",
        &[(2, 1, 2), (3, 0, 4), (0, 1, 4), (1, 2, 4),]
    ),
    t!(
        over020,
        &["abcd", "bcd", "cd"],
        "abcd",
        &[(0, 0, 4), (1, 1, 4), (2, 2, 4),]
    ),
    t!(
        over030,
        &["bcd", "abcd", "cd"],
        "abcd",
        &[(1, 0, 4), (0, 1, 4), (2, 2, 4),]
    ),
    t!(
        over040,
        &["bcd", "cd", "abcd"],
        "abcd",
        &[(2, 0, 4), (0, 1, 4), (1, 2, 4),]
    ),
    t!(over050, &["abc", "bc"], "zazabcz", &[(0, 3, 6), (1, 4, 6),]),
    t!(
        over100,
        &["ab", "ba"],
        "abababa",
        &[(0, 0, 2), (1, 1, 3), (0, 2, 4), (1, 3, 5), (0, 4, 6), (1, 5, 7),]
    ),
    t!(
        over200,
        &["foo", "foo"],
        "foobarfoo",
        &[(0, 0, 3), (1, 0, 3), (0, 6, 9), (1, 6, 9),]
    ),
    t!(over300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]),
    t!(
        over310,
        &["", ""],
        "a",
        &[(0, 0, 0), (1, 0, 0), (0, 1, 1), (1, 1, 1),]
    ),
    t!(over320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1), (0, 1, 1),]),
    t!(
        over330,
        &["", "a", ""],
        "a",
        &[(0, 0, 0), (2, 0, 0), (1, 0, 1), (0, 1, 1), (2, 1, 1),]
    ),
    t!(
        over340,
        &["a", "", ""],
        "a",
        &[(1, 0, 0), (2, 0, 0), (0, 0, 1), (1, 1, 1), (2, 1, 1),]
    ),
    t!(
        over350,
        &["", "", "a"],
        "a",
        &[(0, 0, 0), (1, 0, 0), (2, 0, 1), (0, 1, 1), (1, 1, 1),]
    ),
    t!(
        over360,
        &["foo", "foofoo"],
        "foofoo",
        &[(0, 0, 3), (1, 0, 6), (0, 3, 6)]
    ),
];

/*
Iterators of anchored overlapping searches were removed from the API after
0.7, but we leave the tests commented out for posterity.

/// Like OVERLAPPING, but for anchored searches.
const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[
    t!(aover000, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]),
    t!(aover010, &["bcd", "cd", "b", "abcd"], "abcd", &[(3, 0, 4)]),
    t!(aover020, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4)]),
    t!(aover030, &["bcd", "abcd", "cd"], "abcd", &[(1, 0, 4)]),
    t!(aover040, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4)]),
    t!(aover050, &["abc", "bc"], "zazabcz", &[]),
    t!(aover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]),
    t!(aover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (1, 0, 3)]),
    t!(aover300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]),
    t!(aover310, &["", ""], "a", &[(0, 0, 0), (1, 0, 0)]),
    t!(aover320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1)]),
    t!(aover330, &["", "a", ""], "a", &[(0, 0, 0), (2, 0, 0), (1, 0, 1)]),
    t!(aover340, &["a", "", ""], "a", &[(1, 0, 0), (2, 0, 0), (0, 0, 1)]),
    t!(aover350, &["", "", "a"], "a", &[(0, 0, 0), (1, 0, 0), (2, 0, 1)]),
    t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]),
];
*/

/// Tests for ASCII case insensitivity.
///
/// These tests should all have the same behavior regardless of match
/// semantics or whether the search is overlapping.
const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[
    t!(acasei000, &["a"], "A", &[(0, 0, 1)]),
    t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]),
    t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]),
    t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]),
];

/// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests.
const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[
    t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]),
    t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3)]),
    t!(acasei010, &["abc", "def"], "abcdef", &[(0, 0, 3), (1, 3, 6)]),
];

/// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests.
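///
/// An added sketch (not original test data) of ASCII case insensitivity
/// combined with overlapping search, using the crate's public API:
///
/// ```
/// use aho_corasick::AhoCorasick;
///
/// let ac = AhoCorasick::builder()
///     .ascii_case_insensitive(true)
///     .build(&["foo", "FOO"])
///     .unwrap();
/// // Both patterns case-insensitively match the same span, mirroring the
/// // `acasei000` entry below.
/// assert_eq!(2, ac.find_overlapping_iter("fOo").count());
/// ```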
const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[ t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]), t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]), // This is a regression test from: // https://github.com/BurntSushi/aho-corasick/issues/68 // Previously, it was reporting a duplicate (1, 3, 6) match. t!( acasei010, &["abc", "def", "abcdef"], "abcdef", &[(0, 0, 3), (2, 0, 6), (1, 3, 6)] ), ]; /// Regression tests that are applied to all Aho-Corasick combinations. /// /// If regression tests are needed for specific match semantics, then add them /// to the appropriate group above. const REGRESSION: &'static [SearchTest] = &[ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), t!( regression030, &["libcore/", "libstd/"], "libcore/char/methods.rs", &[(0, 0, 8),] ), t!( regression040, &["libstd/", "libcore/"], "libcore/char/methods.rs", &[(1, 0, 8),] ), t!( regression050, &["\x00\x00\x01", "\x00\x00\x00"], "\x00\x00\x00", &[(1, 0, 3),] ), t!( regression060, &["\x00\x00\x00", "\x00\x00\x01"], "\x00\x00\x00", &[(0, 0, 3),] ), ]; // Now define a test for each combination of things above that we want to run. // Since there are a few different combinations for each collection of tests, // we define a couple of macros to avoid repetition drudgery. The testconfig // macro constructs the automaton from a given match kind, and runs the search // tests one-by-one over the given collection. The `with` parameter allows one // to configure the builder with additional parameters. The testcombo macro // invokes testconfig in precisely this way: it sets up several tests where // each one turns a different knob on AhoCorasickBuilder. macro_rules! testconfig { (anchored, $name:ident, $collection:expr, $kind:ident, $with:expr) => { #[test] fn $name() { run_search_tests($collection, |test| { let mut builder = AhoCorasick::builder(); $with(&mut builder); let input = Input::new(test.haystack).anchored(Anchored::Yes); builder .match_kind(MatchKind::$kind) .build(test.patterns) .unwrap() .try_find_iter(input) .unwrap() .collect() }); } }; (overlapping, $name:ident, $collection:expr, $kind:ident, $with:expr) => { #[test] fn $name() { run_search_tests($collection, |test| { let mut builder = AhoCorasick::builder(); $with(&mut builder); builder .match_kind(MatchKind::$kind) .build(test.patterns) .unwrap() .find_overlapping_iter(test.haystack) .collect() }); } }; (stream, $name:ident, $collection:expr, $kind:ident, $with:expr) => { #[test] fn $name() { run_stream_search_tests($collection, |test| { let buf = std::io::BufReader::with_capacity( 1, test.haystack.as_bytes(), ); let mut builder = AhoCorasick::builder(); $with(&mut builder); builder .match_kind(MatchKind::$kind) .build(test.patterns) .unwrap() .stream_find_iter(buf) .map(|result| result.unwrap()) .collect() }); } }; ($name:ident, $collection:expr, $kind:ident, $with:expr) => { #[test] fn $name() { run_search_tests($collection, |test| { let mut builder = AhoCorasick::builder(); $with(&mut builder); builder .match_kind(MatchKind::$kind) .build(test.patterns) .unwrap() .find_iter(test.haystack) .collect() }); } }; } macro_rules! 
testcombo { ($name:ident, $collection:expr, $kind:ident) => { mod $name { use super::*; testconfig!(default, $collection, $kind, |_| ()); testconfig!( nfa_default, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)); } ); testconfig!( nfa_noncontig_no_prefilter, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) .prefilter(false); } ); testconfig!( nfa_noncontig_all_sparse, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) .dense_depth(0); } ); testconfig!( nfa_noncontig_all_dense, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) .dense_depth(usize::MAX); } ); testconfig!( nfa_contig_default, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)); } ); testconfig!( nfa_contig_no_prefilter, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)) .prefilter(false); } ); testconfig!( nfa_contig_all_sparse, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)) .dense_depth(0); } ); testconfig!( nfa_contig_all_dense, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)) .dense_depth(usize::MAX); } ); testconfig!( nfa_contig_no_byte_class, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)) .byte_classes(false); } ); testconfig!( dfa_default, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)); } ); testconfig!( dfa_start_both, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)) .start_kind(StartKind::Both); } ); testconfig!( dfa_no_prefilter, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)).prefilter(false); } ); testconfig!( dfa_start_both_no_prefilter, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)) .start_kind(StartKind::Both) .prefilter(false); } ); testconfig!( dfa_no_byte_class, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)).byte_classes(false); } ); testconfig!( dfa_start_both_no_byte_class, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)) .start_kind(StartKind::Both) .byte_classes(false); } ); } }; } // Write out the various combinations of match semantics given the variety of // configurations tested by 'testcombo!'. testcombo!(search_leftmost_longest, AC_LEFTMOST_LONGEST, LeftmostLongest); testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST, LeftmostFirst); testcombo!( search_standard_nonoverlapping, AC_STANDARD_NON_OVERLAPPING, Standard ); // Write out the overlapping combo by hand since there is only one of them. 
testconfig!( overlapping, search_standard_overlapping_default, AC_STANDARD_OVERLAPPING, Standard, |_| () ); testconfig!( overlapping, search_standard_overlapping_nfa_noncontig_default, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)); } ); testconfig!( overlapping, search_standard_overlapping_nfa_noncontig_no_prefilter, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)).prefilter(false); } ); testconfig!( overlapping, search_standard_overlapping_nfa_contig_default, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)); } ); testconfig!( overlapping, search_standard_overlapping_nfa_contig_no_prefilter, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)).prefilter(false); } ); testconfig!( overlapping, search_standard_overlapping_nfa_contig_all_sparse, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)).dense_depth(0); } ); testconfig!( overlapping, search_standard_overlapping_nfa_contig_all_dense, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)).dense_depth(usize::MAX); } ); testconfig!( overlapping, search_standard_overlapping_dfa_default, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)); } ); testconfig!( overlapping, search_standard_overlapping_dfa_start_both, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)).start_kind(StartKind::Both); } ); testconfig!( overlapping, search_standard_overlapping_dfa_no_prefilter, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)).prefilter(false); } ); testconfig!( overlapping, search_standard_overlapping_dfa_start_both_no_prefilter, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)) .start_kind(StartKind::Both) .prefilter(false); } ); testconfig!( overlapping, search_standard_overlapping_dfa_no_byte_class, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)).byte_classes(false); } ); testconfig!( overlapping, search_standard_overlapping_dfa_start_both_no_byte_class, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)) .start_kind(StartKind::Both) .byte_classes(false); } ); // Also write out tests manually for streams, since we only test the standard // match semantics. We also don't bother testing different automaton // configurations, since those are well covered by tests above. 
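//
// For illustration (an added sketch, not original source), the streaming API
// exercised below searches any `std::io::Read` incrementally:
//
//     let ac = AhoCorasick::new(&["foo"]).unwrap();
//     let rdr = std::io::Cursor::new(b"barfoobar".to_vec());
//     // `stream_find_iter` yields `io::Result<Match>` items.
//     assert_eq!(1, ac.stream_find_iter(rdr).count());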
#[cfg(feature = "std")] testconfig!( stream, search_standard_stream_default, AC_STANDARD_NON_OVERLAPPING, Standard, |_| () ); #[cfg(feature = "std")] testconfig!( stream, search_standard_stream_nfa_noncontig_default, AC_STANDARD_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)); } ); #[cfg(feature = "std")] testconfig!( stream, search_standard_stream_nfa_contig_default, AC_STANDARD_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)); } ); #[cfg(feature = "std")] testconfig!( stream, search_standard_stream_dfa_default, AC_STANDARD_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)); } ); // Same thing for anchored searches. Write them out manually. testconfig!( anchored, search_standard_anchored_default, AC_STANDARD_ANCHORED_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored); } ); testconfig!( anchored, search_standard_anchored_nfa_noncontig_default, AC_STANDARD_ANCHORED_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored) .kind(Some(AhoCorasickKind::NoncontiguousNFA)); } ); testconfig!( anchored, search_standard_anchored_nfa_contig_default, AC_STANDARD_ANCHORED_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored) .kind(Some(AhoCorasickKind::ContiguousNFA)); } ); testconfig!( anchored, search_standard_anchored_dfa_default, AC_STANDARD_ANCHORED_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored).kind(Some(AhoCorasickKind::DFA)); } ); testconfig!( anchored, search_standard_anchored_dfa_start_both, AC_STANDARD_ANCHORED_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Both).kind(Some(AhoCorasickKind::DFA)); } ); testconfig!( anchored, search_leftmost_first_anchored_default, AC_LEFTMOST_FIRST_ANCHORED, LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored); } ); testconfig!( anchored, search_leftmost_first_anchored_nfa_noncontig_default, AC_LEFTMOST_FIRST_ANCHORED, LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored) .kind(Some(AhoCorasickKind::NoncontiguousNFA)); } ); testconfig!( anchored, search_leftmost_first_anchored_nfa_contig_default, AC_LEFTMOST_FIRST_ANCHORED, LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored) .kind(Some(AhoCorasickKind::ContiguousNFA)); } ); testconfig!( anchored, search_leftmost_first_anchored_dfa_default, AC_LEFTMOST_FIRST_ANCHORED, LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored).kind(Some(AhoCorasickKind::DFA)); } ); testconfig!( anchored, search_leftmost_first_anchored_dfa_start_both, AC_LEFTMOST_FIRST_ANCHORED, LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Both).kind(Some(AhoCorasickKind::DFA)); } ); testconfig!( anchored, search_leftmost_longest_anchored_default, AC_LEFTMOST_LONGEST_ANCHORED, LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored); } ); testconfig!( anchored, search_leftmost_longest_anchored_nfa_noncontig_default, AC_LEFTMOST_LONGEST_ANCHORED, LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored) .kind(Some(AhoCorasickKind::NoncontiguousNFA)); } ); testconfig!( anchored, search_leftmost_longest_anchored_nfa_contig_default, AC_LEFTMOST_LONGEST_ANCHORED, LeftmostLongest, |b: &mut AhoCorasickBuilder| { 
b.start_kind(StartKind::Anchored) .kind(Some(AhoCorasickKind::ContiguousNFA)); } ); testconfig!( anchored, search_leftmost_longest_anchored_dfa_default, AC_LEFTMOST_LONGEST_ANCHORED, LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Anchored).kind(Some(AhoCorasickKind::DFA)); } ); testconfig!( anchored, search_leftmost_longest_anchored_dfa_start_both, AC_LEFTMOST_LONGEST_ANCHORED, LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.start_kind(StartKind::Both).kind(Some(AhoCorasickKind::DFA)); } ); // And also write out the test combinations for ASCII case insensitivity. testconfig!( acasei_standard_default, &[ASCII_CASE_INSENSITIVE], Standard, |b: &mut AhoCorasickBuilder| { b.prefilter(false).ascii_case_insensitive(true); } ); testconfig!( acasei_standard_nfa_noncontig_default, &[ASCII_CASE_INSENSITIVE], Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) .prefilter(false) .ascii_case_insensitive(true); } ); testconfig!( acasei_standard_nfa_contig_default, &[ASCII_CASE_INSENSITIVE], Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)) .prefilter(false) .ascii_case_insensitive(true); } ); testconfig!( acasei_standard_dfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true); } ); testconfig!( overlapping, acasei_standard_overlapping_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], Standard, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true); } ); testconfig!( overlapping, acasei_standard_overlapping_nfa_noncontig_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) .ascii_case_insensitive(true); } ); testconfig!( overlapping, acasei_standard_overlapping_nfa_contig_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)) .ascii_case_insensitive(true); } ); testconfig!( overlapping, acasei_standard_overlapping_dfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], Standard, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true); } ); testconfig!( acasei_leftmost_first_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true); } ); testconfig!( acasei_leftmost_first_nfa_noncontig_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) .ascii_case_insensitive(true); } ); testconfig!( acasei_leftmost_first_nfa_contig_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)) .ascii_case_insensitive(true); } ); testconfig!( acasei_leftmost_first_dfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true); } ); testconfig!( acasei_leftmost_longest_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true); } ); testconfig!( 
acasei_leftmost_longest_nfa_noncontig_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::NoncontiguousNFA)) .ascii_case_insensitive(true); } ); testconfig!( acasei_leftmost_longest_nfa_contig_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::ContiguousNFA)) .ascii_case_insensitive(true); } ); testconfig!( acasei_leftmost_longest_dfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.kind(Some(AhoCorasickKind::DFA)).ascii_case_insensitive(true); } ); fn run_search_tests Vec>( which: TestCollection, mut f: F, ) { let get_match_triples = |matches: Vec| -> Vec<(usize, usize, usize)> { matches .into_iter() .map(|m| (m.pattern().as_usize(), m.start(), m.end())) .collect() }; for &tests in which { for test in tests { assert_eq!( test.matches, get_match_triples(f(&test)).as_slice(), "test: {}, patterns: {:?}, haystack: {:?}", test.name, test.patterns, test.haystack ); } } } // Like 'run_search_tests', but we skip any tests that contain the empty // pattern because stream searching doesn't support it. #[cfg(feature = "std")] fn run_stream_search_tests Vec>( which: TestCollection, mut f: F, ) { let get_match_triples = |matches: Vec| -> Vec<(usize, usize, usize)> { matches .into_iter() .map(|m| (m.pattern().as_usize(), m.start(), m.end())) .collect() }; for &tests in which { for test in tests { if test.patterns.iter().any(|p| p.is_empty()) { continue; } assert_eq!( test.matches, get_match_triples(f(&test)).as_slice(), "test: {}, patterns: {:?}, haystack: {:?}", test.name, test.patterns, test.haystack ); } } } #[test] fn search_tests_have_unique_names() { let assert = |constname, tests: &[SearchTest]| { let mut seen = HashMap::new(); // map from test name to position for (i, test) in tests.iter().enumerate() { if !seen.contains_key(test.name) { seen.insert(test.name, i); } else { let last = seen[test.name]; panic!( "{} tests have duplicate names at positions {} and {}", constname, last, i ); } } }; assert("BASICS", BASICS); assert("STANDARD", STANDARD); assert("LEFTMOST", LEFTMOST); assert("LEFTMOST_FIRST", LEFTMOST_FIRST); assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST); assert("NON_OVERLAPPING", NON_OVERLAPPING); assert("OVERLAPPING", OVERLAPPING); assert("REGRESSION", REGRESSION); } #[cfg(feature = "std")] #[test] #[should_panic] fn stream_not_allowed_leftmost_first() { let fsm = AhoCorasick::builder() .match_kind(MatchKind::LeftmostFirst) .build(None::) .unwrap(); assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0); } #[cfg(feature = "std")] #[test] #[should_panic] fn stream_not_allowed_leftmost_longest() { let fsm = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) .build(None::) .unwrap(); assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0); } #[test] #[should_panic] fn overlapping_not_allowed_leftmost_first() { let fsm = AhoCorasick::builder() .match_kind(MatchKind::LeftmostFirst) .build(None::) .unwrap(); assert_eq!(fsm.find_overlapping_iter("").count(), 0); } #[test] #[should_panic] fn overlapping_not_allowed_leftmost_longest() { let fsm = AhoCorasick::builder() .match_kind(MatchKind::LeftmostLongest) .build(None::) .unwrap(); assert_eq!(fsm.find_overlapping_iter("").count(), 0); } // This tests that if we build an AC matcher with an "unanchored" start kind, // then we can't run an anchored search even 
if the underlying searcher // supports it.
//
// The key bit here is that both of the NFAs in this crate unconditionally
// support both unanchored and anchored searches, but the DFA does not because
// of the added cost of doing so. To avoid the top-level AC matcher sometimes
// supporting anchored and sometimes not (depending on which searcher it
// chooses to use internally), we ensure that the given 'StartKind' is always
// respected.
#[test]
fn anchored_not_allowed_even_if_technically_available() {
    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::NoncontiguousNFA))
        .start_kind(StartKind::Unanchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::Yes)).is_err());

    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::ContiguousNFA))
        .start_kind(StartKind::Unanchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::Yes)).is_err());

    // For completeness, check that the DFA returns an error too.
    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::DFA))
        .start_kind(StartKind::Unanchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::Yes)).is_err());
}

// This is like the test above, but with unanchored and anchored flipped. That
// is, we ask for an AC searcher with anchored support and we check that
// unanchored searches return an error even if the underlying searcher would
// technically support it.
#[test]
fn unanchored_not_allowed_even_if_technically_available() {
    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::NoncontiguousNFA))
        .start_kind(StartKind::Anchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::No)).is_err());

    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::ContiguousNFA))
        .start_kind(StartKind::Anchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::No)).is_err());

    // For completeness, check that the DFA returns an error too.
    let ac = AhoCorasick::builder()
        .kind(Some(AhoCorasickKind::DFA))
        .start_kind(StartKind::Anchored)
        .build(&["foo"])
        .unwrap();
    assert!(ac.try_find(Input::new("foo").anchored(Anchored::No)).is_err());
}

// This tests that a prefilter does not cause a search to report a match
// outside the bounds provided by the caller.
//
// This is a regression test for a bug I introduced during the rewrite of most
// of the crate after 0.7. It was never released. The tricky part here is
// ensuring we get a prefilter that can report matches on its own (such as the
// packed searcher). Otherwise, prefilters that report false positives might
// have searched past the bounds provided by the caller, but confirming the
// match would subsequently fail.
#[test]
fn prefilter_stays_in_bounds() {
    let ac = AhoCorasick::builder()
        .match_kind(MatchKind::LeftmostFirst)
        .build(&["sam", "frodo", "pippin", "merry", "gandalf", "sauron"])
        .unwrap();
    let haystack = "foo gandalf";
    assert_eq!(None, ac.find(Input::new(haystack).range(0..10)));
}

// See: https://github.com/BurntSushi/aho-corasick/issues/44
//
// In short, this test ensures that enabling ASCII case insensitivity does not
// visit an exponential number of states when filling in failure transitions.
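//
// (An added note for intuition, not original source: with ASCII case
// insensitivity enabled, every letter in a pattern effectively matches two
// bytes, its lowercase and uppercase forms, so a naive construction over a
// long, letter-heavy pattern like the one below can visit combinatorially
// many state combinations unless the implementation shares that work.)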
#[test]
fn regression_ascii_case_insensitive_no_exponential() {
    let ac = AhoCorasick::builder()
        .ascii_case_insensitive(true)
        .build(&["Tsubaki House-Triple Shot Vol01校花三姐妹"])
        .unwrap();
    assert!(ac.find("").is_none());
}

// See: https://github.com/BurntSushi/aho-corasick/issues/53
//
// This test ensures that the rare byte prefilter works in a particular corner
// case. In particular, the shift offset detected for '/' in the patterns
// below was incorrect, leading to a false negative.
#[test]
fn regression_rare_byte_prefilter() {
    use crate::AhoCorasick;

    let ac = AhoCorasick::new(&["ab/j/", "x/"]).unwrap();
    assert!(ac.is_match("ab/j/"));
}

#[test]
fn regression_case_insensitive_prefilter() {
    for c in b'a'..b'z' {
        for c2 in b'a'..b'z' {
            let c = c as char;
            let c2 = c2 as char;
            let needle = format!("{}{}", c, c2).to_lowercase();
            let haystack = needle.to_uppercase();
            let ac = AhoCorasick::builder()
                .ascii_case_insensitive(true)
                .prefilter(true)
                .build(&[&needle])
                .unwrap();
            assert_eq!(
                1,
                ac.find_iter(&haystack).count(),
                "failed to find {:?} in {:?}\n\nautomaton:\n{:?}",
                needle,
                haystack,
                ac,
            );
        }
    }
}

// See: https://github.com/BurntSushi/aho-corasick/issues/64
//
// This occurs when the rare byte prefilter is active.
#[cfg(feature = "std")]
#[test]
fn regression_stream_rare_byte_prefilter() {
    use std::io::Read;

    // NOTE: The test only fails if this ends with j.
    const MAGIC: [u8; 5] = *b"1234j";
    // NOTE: The test fails for values in 8188..=8191. These values put the
    // string to search across two calls to read because the buffer size is
    // 64KB by default.
    const BEGIN: usize = 65_535;

    /// This is just a structure that implements `Read`. The reader
    /// implementation will simulate a file filled with 0, except for the
    /// MAGIC string at offset BEGIN.
    #[derive(Default)]
    struct R {
        read: usize,
    }

    impl Read for R {
        fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
            if self.read > 100000 {
                return Ok(0);
            }
            let mut from = 0;
            if self.read < BEGIN {
                from = buf.len().min(BEGIN - self.read);
                for x in 0..from {
                    buf[x] = 0;
                }
                self.read += from;
            }
            if self.read >= BEGIN && self.read <= BEGIN + MAGIC.len() {
                let to = buf.len().min(BEGIN + MAGIC.len() - self.read + from);
                if to > from {
                    buf[from..to].copy_from_slice(
                        &MAGIC
                            [self.read - BEGIN..self.read - BEGIN + to - from],
                    );
                    self.read += to - from;
                    from = to;
                }
            }
            for x in from..buf.len() {
                buf[x] = 0;
                self.read += 1;
            }
            Ok(buf.len())
        }
    }

    fn run() -> std::io::Result<()> {
        let aut = AhoCorasick::builder()
            // Disable byte classes to make debugging the automaton easier. It
            // should have no effect on the test result.
            .byte_classes(false)
            .build(&[&MAGIC])
            .unwrap();

        // While reading from a vector, it works:
        let mut buf = alloc::vec![];
        R::default().read_to_end(&mut buf)?;
        let from_whole = aut.find_iter(&buf).next().unwrap().start();

        // But using stream_find_iter fails!
        let mut file = std::io::BufReader::new(R::default());
        let begin = aut
            .stream_find_iter(&mut file)
            .next()
            .expect("NOT FOUND!!!!")? // Panic here
            .start();
        assert_eq!(from_whole, begin);
        Ok(())
    }

    run().unwrap()
}
aho-corasick-1.1.3/src/transducer.rs000064400000000000000000000202411046102023000154770ustar 00000000000000/*!
Provides implementations of `fst::Automaton` for Aho-Corasick automata.

This works by providing two wrapper types, [`Anchored`] and [`Unanchored`].
The former executes an anchored search on an FST while the latter executes an
unanchored search. Building these wrappers is fallible and will fail if the
underlying Aho-Corasick automaton does not support the type of search it
represents.
*/

use crate::{
    automaton::{Automaton, StateID},
    Anchored as AcAnchored, Input, MatchError,
};

/// Represents an unanchored Aho-Corasick search of a finite state transducer.
///
/// Wrapping an Aho-Corasick automaton in `Unanchored` will fail if the
/// underlying automaton does not support unanchored searches.
///
/// # Example
///
/// This shows how to build an FST of keys and then run an unanchored search
/// on those keys using an Aho-Corasick automaton.
///
/// ```
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Unanchored};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
/// // NFAs always support both unanchored and anchored searches.
/// let searcher = Unanchored::new(&nfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
///     results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["abcd", "bcd", "xyz"], results);
/// ```
#[derive(Clone, Debug)]
pub struct Unanchored<A>(A);

impl<A: Automaton> Unanchored<A> {
    /// Create a new `Unanchored` implementation of the `fst::Automaton`
    /// trait.
    ///
    /// If the given Aho-Corasick automaton does not support unanchored
    /// searches, then this returns an error.
    pub fn new(aut: A) -> Result<Unanchored<A>, MatchError> {
        let input = Input::new("").anchored(AcAnchored::No);
        let _ = aut.start_state(&input)?;
        Ok(Unanchored(aut))
    }

    /// Returns a borrow to the underlying automaton.
    pub fn as_ref(&self) -> &A {
        &self.0
    }

    /// Unwrap this value and return the inner automaton.
    pub fn into_inner(self) -> A {
        self.0
    }
}

impl<A: Automaton> fst::Automaton for Unanchored<A> {
    type State = StateID;

    #[inline]
    fn start(&self) -> StateID {
        let input = Input::new("").anchored(AcAnchored::No);
        self.0.start_state(&input).expect("support for unanchored searches")
    }

    #[inline]
    fn is_match(&self, state: &StateID) -> bool {
        self.0.is_match(*state)
    }

    #[inline]
    fn accept(&self, state: &StateID, byte: u8) -> StateID {
        if fst::Automaton::is_match(self, state) {
            return *state;
        }
        self.0.next_state(AcAnchored::No, *state, byte)
    }

    #[inline]
    fn can_match(&self, state: &StateID) -> bool {
        !self.0.is_dead(*state)
    }
}

/// Represents an anchored Aho-Corasick search of a finite state transducer.
///
/// Wrapping an Aho-Corasick automaton in `Anchored` will fail if the
/// underlying automaton does not support anchored searches.
///
/// # Example
///
/// This shows how to build an FST of keys and then run an anchored search on
/// those keys using an Aho-Corasick automaton.
///
/// ```
/// use aho_corasick::{nfa::contiguous::NFA, transducer::Anchored};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let nfa = NFA::new(&["bcd", "x"]).unwrap();
/// // NFAs always support both unanchored and anchored searches.
/// let searcher = Anchored::new(&nfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
///     results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["bcd", "xyz"], results);
/// ```
///
/// This is like the example above, except we use an Aho-Corasick DFA, which
/// requires explicitly configuring it to support anchored searches. (NFAs
/// unconditionally support both unanchored and anchored searches.)
///
/// ```
/// use aho_corasick::{dfa::DFA, transducer::Anchored, StartKind};
/// use fst::{Automaton, IntoStreamer, Set, Streamer};
///
/// let set = Set::from_iter(&["abcd", "bc", "bcd", "xyz"]).unwrap();
/// let dfa = DFA::builder()
///     .start_kind(StartKind::Anchored)
///     .build(&["bcd", "x"])
///     .unwrap();
/// // We've explicitly configured our DFA to support anchored searches.
/// let searcher = Anchored::new(&dfa).unwrap();
///
/// let mut stream = set.search(searcher).into_stream();
/// let mut results = vec![];
/// while let Some(key) = stream.next() {
///     results.push(std::str::from_utf8(key).unwrap().to_string());
/// }
/// assert_eq!(vec!["bcd", "xyz"], results);
/// ```
#[derive(Clone, Debug)]
pub struct Anchored<A>(A);

impl<A: Automaton> Anchored<A> {
    /// Create a new `Anchored` implementation of the `fst::Automaton` trait.
    ///
    /// If the given Aho-Corasick automaton does not support anchored
    /// searches, then this returns an error.
    pub fn new(aut: A) -> Result<Anchored<A>, MatchError> {
        let input = Input::new("").anchored(AcAnchored::Yes);
        let _ = aut.start_state(&input)?;
        Ok(Anchored(aut))
    }

    /// Returns a borrow to the underlying automaton.
    pub fn as_ref(&self) -> &A {
        &self.0
    }

    /// Unwrap this value and return the inner automaton.
    pub fn into_inner(self) -> A {
        self.0
    }
}

impl<A: Automaton> fst::Automaton for Anchored<A> {
    type State = StateID;

    #[inline]
    fn start(&self) -> StateID {
        let input = Input::new("").anchored(AcAnchored::Yes);
        self.0.start_state(&input).expect("support for anchored searches")
    }

    #[inline]
    fn is_match(&self, state: &StateID) -> bool {
        self.0.is_match(*state)
    }

    #[inline]
    fn accept(&self, state: &StateID, byte: u8) -> StateID {
        if fst::Automaton::is_match(self, state) {
            return *state;
        }
        self.0.next_state(AcAnchored::Yes, *state, byte)
    }

    #[inline]
    fn can_match(&self, state: &StateID) -> bool {
        !self.0.is_dead(*state)
    }
}

#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use fst::{Automaton, IntoStreamer, Set, Streamer};

    use crate::{
        dfa::DFA,
        nfa::{contiguous, noncontiguous},
        StartKind,
    };

    use super::*;

    fn search<A: fst::Automaton, D: AsRef<[u8]>>(
        set: &Set<D>,
        aut: A,
    ) -> Vec<String> {
        let mut stream = set.search(aut).into_stream();
        let mut results = vec![];
        while let Some(key) = stream.next() {
            results.push(String::from(core::str::from_utf8(key).unwrap()));
        }
        results
    }

    #[test]
    fn unanchored() {
        let set =
            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
                .unwrap();
        let patterns = vec!["baz", "bax"];
        let expected = vec!["baz", "xbax"];

        let aut = Unanchored(noncontiguous::NFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);

        let aut = Unanchored(contiguous::NFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);

        let aut = Unanchored(DFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);
    }

    #[test]
    fn anchored() {
        let set =
            Set::from_iter(&["a", "bar", "baz", "wat", "xba", "xbax", "z"])
                .unwrap();
        let patterns = vec!["baz", "bax"];
        let expected = vec!["baz"];

        let aut = Anchored(noncontiguous::NFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);

        let aut = Anchored(contiguous::NFA::new(&patterns).unwrap());
        let got = search(&set, &aut);
        assert_eq!(got, expected);

        let aut = Anchored(
            DFA::builder()
                .start_kind(StartKind::Anchored)
                .build(&patterns)
                .unwrap(),
        );
        let got = search(&set, &aut);
        assert_eq!(got, expected);
    }
}
aho-corasick-1.1.3/src/util/alphabet.rs000064400000000000000000000322701046102023000160670ustar 00000000000000use crate::util::int::Usize;

/// A representation of byte oriented equivalence classes.
///
/// This is used in finite state machines to reduce the size of the
/// transition table. This can have a particularly large impact not only on
/// the total size of an FSM, but also on FSM build times because it reduces
/// the number of transitions that need to be visited/set.
#[derive(Clone, Copy)]
pub(crate) struct ByteClasses([u8; 256]);

impl ByteClasses {
    /// Creates a new set of equivalence classes where all bytes are mapped
    /// to the same class.
    pub(crate) fn empty() -> ByteClasses {
        ByteClasses([0; 256])
    }

    /// Creates a new set of equivalence classes where each byte belongs to
    /// its own equivalence class.
    pub(crate) fn singletons() -> ByteClasses {
        let mut classes = ByteClasses::empty();
        for b in 0..=255 {
            classes.set(b, b);
        }
        classes
    }

    /// Set the equivalence class for the given byte.
    #[inline]
    pub(crate) fn set(&mut self, byte: u8, class: u8) {
        self.0[usize::from(byte)] = class;
    }

    /// Get the equivalence class for the given byte.
    #[inline]
    pub(crate) fn get(&self, byte: u8) -> u8 {
        self.0[usize::from(byte)]
    }

    /// Return the total number of elements in the alphabet represented by
    /// these equivalence classes. Equivalently, this returns the total
    /// number of equivalence classes.
    #[inline]
    pub(crate) fn alphabet_len(&self) -> usize {
        // Add one since the number of equivalence classes is one bigger than
        // the last one.
        usize::from(self.0[255]) + 1
    }

    /// Returns the stride, as a base-2 exponent, required for these
    /// equivalence classes.
    ///
    /// The stride is always the smallest power of 2 that is greater than or
    /// equal to the alphabet length. This is done so that converting between
    /// state IDs and indices can be done with shifts alone, which is much
    /// faster than integer division. The "stride2" is the exponent. i.e.,
    /// `2^stride2 = stride`.
    pub(crate) fn stride2(&self) -> usize {
        let zeros = self.alphabet_len().next_power_of_two().trailing_zeros();
        usize::try_from(zeros).unwrap()
    }

    /// Returns the stride for these equivalence classes, which corresponds
    /// to the smallest power of 2 greater than or equal to the number of
    /// equivalence classes.
    pub(crate) fn stride(&self) -> usize {
        1 << self.stride2()
    }

    /// Returns true if and only if every byte in this class maps to its own
    /// equivalence class. Equivalently, there are 256 equivalence classes
    /// and each class contains exactly one byte.
    #[inline]
    pub(crate) fn is_singleton(&self) -> bool {
        self.alphabet_len() == 256
    }

    /// Returns an iterator over all equivalence classes in this set.
    pub(crate) fn iter(&self) -> ByteClassIter {
        ByteClassIter { it: 0..self.alphabet_len() }
    }

    /// Returns an iterator of the bytes in the given equivalence class.
    pub(crate) fn elements(&self, class: u8) -> ByteClassElements {
        ByteClassElements { classes: self, class, bytes: 0..=255 }
    }

    /// Returns an iterator of byte ranges in the given equivalence class.
    ///
    /// That is, a sequence of contiguous ranges are returned. Typically,
    /// every class maps to a single contiguous range.
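    ///
    /// For example (an added illustrative note): if `b'b'..=b'd'` forms its
    /// own equivalence class, as in the `elements_typical` test below, then
    /// iterating the ranges of that class yields the single pair
    /// `(b'b', b'd')`.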
    fn element_ranges(&self, class: u8) -> ByteClassElementRanges {
        ByteClassElementRanges { elements: self.elements(class), range: None }
    }
}

impl core::fmt::Debug for ByteClasses {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        if self.is_singleton() {
            write!(f, "ByteClasses()")
        } else {
            write!(f, "ByteClasses(")?;
            for (i, class) in self.iter().enumerate() {
                if i > 0 {
                    write!(f, ", ")?;
                }
                write!(f, "{:?} => [", class)?;
                for (start, end) in self.element_ranges(class) {
                    if start == end {
                        write!(f, "{:?}", start)?;
                    } else {
                        write!(f, "{:?}-{:?}", start, end)?;
                    }
                }
                write!(f, "]")?;
            }
            write!(f, ")")
        }
    }
}

/// An iterator over each equivalence class.
#[derive(Debug)]
pub(crate) struct ByteClassIter {
    it: core::ops::Range<usize>,
}

impl Iterator for ByteClassIter {
    type Item = u8;

    fn next(&mut self) -> Option<u8> {
        self.it.next().map(|class| class.as_u8())
    }
}

/// An iterator over all elements in a specific equivalence class.
#[derive(Debug)]
pub(crate) struct ByteClassElements<'a> {
    classes: &'a ByteClasses,
    class: u8,
    bytes: core::ops::RangeInclusive<u8>,
}

impl<'a> Iterator for ByteClassElements<'a> {
    type Item = u8;

    fn next(&mut self) -> Option<u8> {
        while let Some(byte) = self.bytes.next() {
            if self.class == self.classes.get(byte) {
                return Some(byte);
            }
        }
        None
    }
}

/// An iterator over all elements in an equivalence class expressed as a
/// sequence of contiguous ranges.
#[derive(Debug)]
pub(crate) struct ByteClassElementRanges<'a> {
    elements: ByteClassElements<'a>,
    range: Option<(u8, u8)>,
}

impl<'a> Iterator for ByteClassElementRanges<'a> {
    type Item = (u8, u8);

    fn next(&mut self) -> Option<(u8, u8)> {
        loop {
            let element = match self.elements.next() {
                None => return self.range.take(),
                Some(element) => element,
            };
            match self.range.take() {
                None => {
                    self.range = Some((element, element));
                }
                Some((start, end)) => {
                    if usize::from(end) + 1 != usize::from(element) {
                        self.range = Some((element, element));
                        return Some((start, end));
                    }
                    self.range = Some((start, element));
                }
            }
        }
    }
}

/// A partitioning of bytes into equivalence classes.
///
/// A byte class set keeps track of an *approximation* of equivalence classes
/// of bytes during NFA construction. That is, every byte in an equivalence
/// class cannot discriminate between a match and a non-match.
///
/// Note that this may not compute the minimal set of equivalence classes.
/// Basically, any byte in a pattern given to the noncontiguous NFA builder
/// will automatically be treated as its own equivalence class. All other
/// bytes---any byte not in any pattern---will be treated as their own
/// equivalence classes. In theory, all bytes not in any pattern should
/// be part of a single equivalence class, but in practice, we only treat
/// contiguous ranges of bytes as an equivalence class. So the number of
/// classes computed may be bigger than necessary. This usually doesn't make
/// much of a difference, and keeps the implementation simple.
#[derive(Clone, Debug)]
pub(crate) struct ByteClassSet(ByteSet);

impl Default for ByteClassSet {
    fn default() -> ByteClassSet {
        ByteClassSet::empty()
    }
}

impl ByteClassSet {
    /// Create a new set of byte classes where all bytes are part of the same
    /// equivalence class.
    pub(crate) fn empty() -> Self {
        ByteClassSet(ByteSet::empty())
    }

    /// Indicate that the given range of bytes (inclusive) can discriminate
    /// a match between it and all other bytes outside of the range.
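    ///
    /// For example (an added illustrative note): `set_range(b'a', b'z')`
    /// records boundaries at `b'a' - 1` and `b'z'`, so the `byte_classes()`
    /// built from this set place `a-z` in their own class, distinct from the
    /// bytes below `a` and the bytes above `z` (see the `byte_classes` test
    /// below).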
pub(crate) fn set_range(&mut self, start: u8, end: u8) { debug_assert!(start <= end); if start > 0 { self.0.add(start - 1); } self.0.add(end); } /// Convert this boolean set to a map that maps all byte values to their /// corresponding equivalence class. The last mapping indicates the largest /// equivalence class identifier (which is never bigger than 255). pub(crate) fn byte_classes(&self) -> ByteClasses { let mut classes = ByteClasses::empty(); let mut class = 0u8; let mut b = 0u8; loop { classes.set(b, class); if b == 255 { break; } if self.0.contains(b) { class = class.checked_add(1).unwrap(); } b = b.checked_add(1).unwrap(); } classes } } /// A simple set of bytes that is reasonably cheap to copy and allocation free. #[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] pub(crate) struct ByteSet { bits: BitSet, } /// The representation of a byte set. Split out so that we can define a /// convenient Debug impl for it while keeping "ByteSet" in the output. #[derive(Clone, Copy, Default, Eq, PartialEq)] struct BitSet([u128; 2]); impl ByteSet { /// Create an empty set of bytes. pub(crate) fn empty() -> ByteSet { ByteSet { bits: BitSet([0; 2]) } } /// Add a byte to this set. /// /// If the given byte already belongs to this set, then this is a no-op. pub(crate) fn add(&mut self, byte: u8) { let bucket = byte / 128; let bit = byte % 128; self.bits.0[usize::from(bucket)] |= 1 << bit; } /// Return true if and only if the given byte is in this set. pub(crate) fn contains(&self, byte: u8) -> bool { let bucket = byte / 128; let bit = byte % 128; self.bits.0[usize::from(bucket)] & (1 << bit) > 0 } } impl core::fmt::Debug for BitSet { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let mut fmtd = f.debug_set(); for b in 0u8..=255 { if (ByteSet { bits: *self }).contains(b) { fmtd.entry(&b); } } fmtd.finish() } } #[cfg(test)] mod tests { use alloc::{vec, vec::Vec}; use super::*; #[test] fn byte_classes() { let mut set = ByteClassSet::empty(); set.set_range(b'a', b'z'); let classes = set.byte_classes(); assert_eq!(classes.get(0), 0); assert_eq!(classes.get(1), 0); assert_eq!(classes.get(2), 0); assert_eq!(classes.get(b'a' - 1), 0); assert_eq!(classes.get(b'a'), 1); assert_eq!(classes.get(b'm'), 1); assert_eq!(classes.get(b'z'), 1); assert_eq!(classes.get(b'z' + 1), 2); assert_eq!(classes.get(254), 2); assert_eq!(classes.get(255), 2); let mut set = ByteClassSet::empty(); set.set_range(0, 2); set.set_range(4, 6); let classes = set.byte_classes(); assert_eq!(classes.get(0), 0); assert_eq!(classes.get(1), 0); assert_eq!(classes.get(2), 0); assert_eq!(classes.get(3), 1); assert_eq!(classes.get(4), 2); assert_eq!(classes.get(5), 2); assert_eq!(classes.get(6), 2); assert_eq!(classes.get(7), 3); assert_eq!(classes.get(255), 3); } #[test] fn full_byte_classes() { let mut set = ByteClassSet::empty(); for b in 0u8..=255 { set.set_range(b, b); } assert_eq!(set.byte_classes().alphabet_len(), 256); } #[test] fn elements_typical() { let mut set = ByteClassSet::empty(); set.set_range(b'b', b'd'); set.set_range(b'g', b'm'); set.set_range(b'z', b'z'); let classes = set.byte_classes(); // class 0: \x00-a // class 1: b-d // class 2: e-f // class 3: g-m // class 4: n-y // class 5: z-z // class 6: \x7B-\xFF assert_eq!(classes.alphabet_len(), 7); let elements = classes.elements(0).collect::>(); assert_eq!(elements.len(), 98); assert_eq!(elements[0], b'\x00'); assert_eq!(elements[97], b'a'); let elements = classes.elements(1).collect::>(); assert_eq!(elements, vec![b'b', b'c', b'd'],); let elements = 
classes.elements(2).collect::>(); assert_eq!(elements, vec![b'e', b'f'],); let elements = classes.elements(3).collect::>(); assert_eq!(elements, vec![b'g', b'h', b'i', b'j', b'k', b'l', b'm',],); let elements = classes.elements(4).collect::>(); assert_eq!(elements.len(), 12); assert_eq!(elements[0], b'n'); assert_eq!(elements[11], b'y'); let elements = classes.elements(5).collect::>(); assert_eq!(elements, vec![b'z']); let elements = classes.elements(6).collect::>(); assert_eq!(elements.len(), 133); assert_eq!(elements[0], b'\x7B'); assert_eq!(elements[132], b'\xFF'); } #[test] fn elements_singletons() { let classes = ByteClasses::singletons(); assert_eq!(classes.alphabet_len(), 256); let elements = classes.elements(b'a').collect::>(); assert_eq!(elements, vec![b'a']); } #[test] fn elements_empty() { let classes = ByteClasses::empty(); assert_eq!(classes.alphabet_len(), 1); let elements = classes.elements(0).collect::>(); assert_eq!(elements.len(), 256); assert_eq!(elements[0], b'\x00'); assert_eq!(elements[255], b'\xFF'); } } aho-corasick-1.1.3/src/util/buffer.rs000064400000000000000000000126131046102023000155570ustar 00000000000000use alloc::{vec, vec::Vec}; /// The default buffer capacity that we use for the stream buffer. const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB /// A fairly simple roll buffer for supporting stream searches. /// /// This buffer acts as a temporary place to store a fixed amount of data when /// reading from a stream. Its central purpose is to allow "rolling" some /// suffix of the data to the beginning of the buffer before refilling it with /// more data from the stream. For example, let's say we are trying to match /// "foobar" on a stream. When we report the match, we'd like to not only /// report the correct offsets at which the match occurs, but also the matching /// bytes themselves. So let's say our stream is a file with the following /// contents: `test test foobar test test`. Now assume that we happen to read /// the aforementioned file in two chunks: `test test foo` and `bar test test`. /// Naively, it would not be possible to report a single contiguous `foobar` /// match, but this roll buffer allows us to do that. Namely, after the second /// read, the contents of the buffer should be `st foobar test test`, where the /// search should ultimately resume immediately after `foo`. (The prefix `st ` /// is included because the roll buffer saves N bytes at the end of the buffer, /// where N is the maximum possible length of a match.) /// /// A lot of the logic for dealing with this is unfortunately split out between /// this roll buffer and the `StreamChunkIter`. /// /// Note also that this buffer is not actually required to just report matches. /// Because a `Match` is just some offsets. But it *is* required for supporting /// things like `try_stream_replace_all` because that needs some mechanism for /// knowing which bytes in the stream correspond to a match and which don't. So /// when a match occurs across two `read` calls, *something* needs to retain /// the bytes from the previous `read` call because you don't know before the /// second read call whether a match exists or not. #[derive(Debug)] pub(crate) struct Buffer { /// The raw buffer contents. This has a fixed size and never increases. buf: Vec, /// The minimum size of the buffer, which is equivalent to the maximum /// possible length of a match. This corresponds to the amount that we /// roll min: usize, /// The end of the contents of this buffer. 
end: usize, } impl Buffer { /// Create a new buffer for stream searching. The minimum buffer length /// given should be the size of the maximum possible match length. pub(crate) fn new(min_buffer_len: usize) -> Buffer { let min = core::cmp::max(1, min_buffer_len); // The minimum buffer amount is also the amount that we roll our // buffer in order to support incremental searching. To this end, // our actual capacity needs to be at least 1 byte bigger than our // minimum amount, otherwise we won't have any overlap. In actuality, // we want our buffer to be a bit bigger than that for performance // reasons, so we set a lower bound of `8 * min`. // // TODO: It would be good to find a way to test the streaming // implementation with the minimal buffer size. For now, we just // uncomment the next line and comment out the subsequent line. // let capacity = 1 + min; let capacity = core::cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); Buffer { buf: vec![0; capacity], min, end: 0 } } /// Return the contents of this buffer. #[inline] pub(crate) fn buffer(&self) -> &[u8] { &self.buf[..self.end] } /// Return the minimum size of the buffer. The only way a buffer may be /// smaller than this is if the stream itself contains less than the /// minimum buffer amount. #[inline] pub(crate) fn min_buffer_len(&self) -> usize { self.min } /// Return all free capacity in this buffer. fn free_buffer(&mut self) -> &mut [u8] { &mut self.buf[self.end..] } /// Refill the contents of this buffer by reading as much as possible into /// this buffer's free capacity. If no more bytes could be read, then this /// returns false. Otherwise, this reads until it has filled the buffer /// past the minimum amount. pub(crate) fn fill<R: std::io::Read>( &mut self, mut rdr: R, ) -> std::io::Result<bool> { let mut readany = false; loop { let readlen = rdr.read(self.free_buffer())?; if readlen == 0 { return Ok(readany); } readany = true; self.end += readlen; if self.buffer().len() >= self.min { return Ok(true); } } } /// Roll the contents of the buffer so that the suffix of this buffer is /// moved to the front and all other contents are dropped. The size of the /// suffix corresponds precisely to the minimum buffer length. /// /// This should only be called when the entire contents of this buffer have /// been searched. pub(crate) fn roll(&mut self) { let roll_start = self .end .checked_sub(self.min) .expect("buffer capacity should be bigger than minimum amount"); let roll_end = roll_start + self.min; assert!(roll_end <= self.end); self.buf.copy_within(roll_start..roll_end, 0); self.end = self.min; } } aho-corasick-1.1.3/src/util/byte_frequencies.rs000064400000000000000000000105171046102023000176430ustar 00000000000000pub const BYTE_FREQUENCIES: [u8; 256] = [ 55, // '\x00' 52, // '\x01' 51, // '\x02' 50, // '\x03' 49, // '\x04' 48, // '\x05' 47, // '\x06' 46, // '\x07' 45, // '\x08' 103, // '\t' 242, // '\n' 66, // '\x0b' 67, // '\x0c' 229, // '\r' 44, // '\x0e' 43, // '\x0f' 42, // '\x10' 41, // '\x11' 40, // '\x12' 39, // '\x13' 38, // '\x14' 37, // '\x15' 36, // '\x16' 35, // '\x17' 34, // '\x18' 33, // '\x19' 56, // '\x1a' 32, // '\x1b' 31, // '\x1c' 30, // '\x1d' 29, // '\x1e' 28, // '\x1f' 255, // ' ' 148, // '!' 164, // '"' 149, // '#' 136, // '$' 160, // '%' 155, // '&' 173, // "'" 221, // '(' 222, // ')' 134, // '*' 122, // '+' 232, // ',' 202, // '-' 215, // '.'
224, // '/' 208, // '0' 220, // '1' 204, // '2' 187, // '3' 183, // '4' 179, // '5' 177, // '6' 168, // '7' 178, // '8' 200, // '9' 226, // ':' 195, // ';' 154, // '<' 184, // '=' 174, // '>' 126, // '?' 120, // '@' 191, // 'A' 157, // 'B' 194, // 'C' 170, // 'D' 189, // 'E' 162, // 'F' 161, // 'G' 150, // 'H' 193, // 'I' 142, // 'J' 137, // 'K' 171, // 'L' 176, // 'M' 185, // 'N' 167, // 'O' 186, // 'P' 112, // 'Q' 175, // 'R' 192, // 'S' 188, // 'T' 156, // 'U' 140, // 'V' 143, // 'W' 123, // 'X' 133, // 'Y' 128, // 'Z' 147, // '[' 138, // '\\' 146, // ']' 114, // '^' 223, // '_' 151, // '`' 249, // 'a' 216, // 'b' 238, // 'c' 236, // 'd' 253, // 'e' 227, // 'f' 218, // 'g' 230, // 'h' 247, // 'i' 135, // 'j' 180, // 'k' 241, // 'l' 233, // 'm' 246, // 'n' 244, // 'o' 231, // 'p' 139, // 'q' 245, // 'r' 243, // 's' 251, // 't' 235, // 'u' 201, // 'v' 196, // 'w' 240, // 'x' 214, // 'y' 152, // 'z' 182, // '{' 205, // '|' 181, // '}' 127, // '~' 27, // '\x7f' 212, // '\x80' 211, // '\x81' 210, // '\x82' 213, // '\x83' 228, // '\x84' 197, // '\x85' 169, // '\x86' 159, // '\x87' 131, // '\x88' 172, // '\x89' 105, // '\x8a' 80, // '\x8b' 98, // '\x8c' 96, // '\x8d' 97, // '\x8e' 81, // '\x8f' 207, // '\x90' 145, // '\x91' 116, // '\x92' 115, // '\x93' 144, // '\x94' 130, // '\x95' 153, // '\x96' 121, // '\x97' 107, // '\x98' 132, // '\x99' 109, // '\x9a' 110, // '\x9b' 124, // '\x9c' 111, // '\x9d' 82, // '\x9e' 108, // '\x9f' 118, // '\xa0' 141, // '¡' 113, // '¢' 129, // '£' 119, // '¤' 125, // '¥' 165, // '¦' 117, // '§' 92, // '¨' 106, // '©' 83, // 'ª' 72, // '«' 99, // '¬' 93, // '\xad' 65, // '®' 79, // '¯' 166, // '°' 237, // '±' 163, // '²' 199, // '³' 190, // '´' 225, // 'µ' 209, // '¶' 203, // '·' 198, // '¸' 217, // '¹' 219, // 'º' 206, // '»' 234, // '¼' 248, // '½' 158, // '¾' 239, // '¿' 255, // 'À' 255, // 'Á' 255, // 'Â' 255, // 'Ã' 255, // 'Ä' 255, // 'Å' 255, // 'Æ' 255, // 'Ç' 255, // 'È' 255, // 'É' 255, // 'Ê' 255, // 'Ë' 255, // 'Ì' 255, // 'Í' 255, // 'Î' 255, // 'Ï' 255, // 'Ð' 255, // 'Ñ' 255, // 'Ò' 255, // 'Ó' 255, // 'Ô' 255, // 'Õ' 255, // 'Ö' 255, // '×' 255, // 'Ø' 255, // 'Ù' 255, // 'Ú' 255, // 'Û' 255, // 'Ü' 255, // 'Ý' 255, // 'Þ' 255, // 'ß' 255, // 'à' 255, // 'á' 255, // 'â' 255, // 'ã' 255, // 'ä' 255, // 'å' 255, // 'æ' 255, // 'ç' 255, // 'è' 255, // 'é' 255, // 'ê' 255, // 'ë' 255, // 'ì' 255, // 'í' 255, // 'î' 255, // 'ï' 255, // 'ð' 255, // 'ñ' 255, // 'ò' 255, // 'ó' 255, // 'ô' 255, // 'õ' 255, // 'ö' 255, // '÷' 255, // 'ø' 255, // 'ù' 255, // 'ú' 255, // 'û' 255, // 'ü' 255, // 'ý' 255, // 'þ' 255, // 'ÿ' ]; aho-corasick-1.1.3/src/util/debug.rs000064400000000000000000000017251046102023000153760ustar 00000000000000/// A type that wraps a single byte with a convenient fmt::Debug impl that /// escapes the byte. pub(crate) struct DebugByte(pub(crate) u8); impl core::fmt::Debug for DebugByte { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { // Special case ASCII space. It's too hard to read otherwise, so // put quotes around it. I sometimes wonder whether just '\x20' would // be better... if self.0 == b' ' { return write!(f, "' '"); } // 10 bytes is enough to cover any output from ascii::escape_default. 
let mut bytes = [0u8; 10]; let mut len = 0; for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { // capitalize \xab to \xAB if i >= 2 && b'a' <= b && b <= b'f' { b -= 32; } bytes[len] = b; len += 1; } write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) } } aho-corasick-1.1.3/src/util/error.rs000064400000000000000000000222111046102023000154320ustar 00000000000000use crate::util::{ primitives::{PatternID, SmallIndex}, search::MatchKind, }; /// An error that occurred during the construction of an Aho-Corasick /// automaton. /// /// Build errors occur when some kind of limit has been exceeded, either in the /// number of states, the number of patterns, or the length of a pattern. These /// limits aren't part of the public API, but they should generally be large /// enough to handle most use cases. /// /// When the `std` feature is enabled, this implements the `std::error::Error` /// trait. #[derive(Clone, Debug)] pub struct BuildError { kind: ErrorKind, } /// The kind of error that occurred. #[derive(Clone, Debug)] enum ErrorKind { /// An error that occurs when allocating a new state would result in an /// identifier that exceeds the capacity of a `StateID`. StateIDOverflow { /// The maximum possible id. max: u64, /// The maximum ID requested. requested_max: u64, }, /// An error that occurs when adding a pattern to an Aho-Corasick /// automaton would result in an identifier that exceeds the capacity of a /// `PatternID`. PatternIDOverflow { /// The maximum possible id. max: u64, /// The maximum ID requested. requested_max: u64, }, /// Occurs when a pattern string is given to the Aho-Corasick constructor /// that is too long. PatternTooLong { /// The ID of the pattern that was too long. pattern: PatternID, /// The length that was too long. len: usize, }, } impl BuildError { pub(crate) fn state_id_overflow( max: u64, requested_max: u64, ) -> BuildError { BuildError { kind: ErrorKind::StateIDOverflow { max, requested_max } } } pub(crate) fn pattern_id_overflow( max: u64, requested_max: u64, ) -> BuildError { BuildError { kind: ErrorKind::PatternIDOverflow { max, requested_max }, } } pub(crate) fn pattern_too_long( pattern: PatternID, len: usize, ) -> BuildError { BuildError { kind: ErrorKind::PatternTooLong { pattern, len } } } } #[cfg(feature = "std")] impl std::error::Error for BuildError {} impl core::fmt::Display for BuildError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self.kind { ErrorKind::StateIDOverflow { max, requested_max } => { write!( f, "state identifier overflow: failed to create state ID \ from {}, which exceeds the max of {}", requested_max, max, ) } ErrorKind::PatternIDOverflow { max, requested_max } => { write!( f, "pattern identifier overflow: failed to create pattern ID \ from {}, which exceeds the max of {}", requested_max, max, ) } ErrorKind::PatternTooLong { pattern, len } => { write!( f, "pattern {} with length {} exceeds \ the maximum pattern length of {}", pattern.as_usize(), len, SmallIndex::MAX.as_usize(), ) } } } } /// An error that occurred during an Aho-Corasick search. /// /// An error that occurs during a search is limited to some kind of /// misconfiguration that resulted in an illegal call. Stated differently, /// whether an error occurs is not dependent on the specific bytes in the /// haystack.
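// Illustrative sketch (not crate code): how a caller might branch on the
// search-error kinds defined below. `MatchError::kind` and `MatchErrorKind`
// are from this module; the messages themselves are hypothetical.
//
// fn describe(err: &MatchError) -> &'static str {
//     match err.kind() {
//         MatchErrorKind::InvalidInputAnchored => "anchored search unsupported",
//         MatchErrorKind::InvalidInputUnanchored => "unanchored search unsupported",
//         MatchErrorKind::UnsupportedStream { .. } => "stream search requires standard match semantics",
//         MatchErrorKind::UnsupportedOverlapping { .. } => "overlapping search requires standard match semantics",
//         MatchErrorKind::UnsupportedEmpty => "operation does not support empty patterns",
//         // `MatchErrorKind` is non-exhaustive, so be ready for new variants.
//         _ => "unknown search error",
//     }
// }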
/// /// Examples of misconfiguration: /// /// * Executing a stream or overlapping search on a searcher that was built with /// something other than [`MatchKind::Standard`](crate::MatchKind::Standard) /// semantics. /// * Requesting an anchored or an unanchored search on a searcher that doesn't /// support unanchored or anchored searches, respectively. /// /// When the `std` feature is enabled, this implements the `std::error::Error` /// trait. #[derive(Clone, Debug, Eq, PartialEq)] pub struct MatchError(alloc::boxed::Box<MatchErrorKind>); impl MatchError { /// Create a new error value with the given kind. /// /// This is a more verbose version of the kind-specific constructors, e.g., /// `MatchError::unsupported_stream`. pub fn new(kind: MatchErrorKind) -> MatchError { MatchError(alloc::boxed::Box::new(kind)) } /// Returns a reference to the underlying error kind. pub fn kind(&self) -> &MatchErrorKind { &self.0 } /// Create a new "invalid anchored search" error. This occurs when the /// caller requests an anchored search but where anchored searches aren't /// supported. /// /// This is the same as calling `MatchError::new` with a /// [`MatchErrorKind::InvalidInputAnchored`] kind. pub fn invalid_input_anchored() -> MatchError { MatchError::new(MatchErrorKind::InvalidInputAnchored) } /// Create a new "invalid unanchored search" error. This occurs when the /// caller requests an unanchored search but where unanchored searches /// aren't supported. /// /// This is the same as calling `MatchError::new` with a /// [`MatchErrorKind::InvalidInputUnanchored`] kind. pub fn invalid_input_unanchored() -> MatchError { MatchError::new(MatchErrorKind::InvalidInputUnanchored) } /// Create a new "unsupported stream search" error. This occurs when the /// caller requests a stream search while using an Aho-Corasick automaton /// with a match kind other than [`MatchKind::Standard`]. /// /// The match kind given should be the match kind of the automaton. It /// should never be `MatchKind::Standard`. pub fn unsupported_stream(got: MatchKind) -> MatchError { MatchError::new(MatchErrorKind::UnsupportedStream { got }) } /// Create a new "unsupported overlapping search" error. This occurs when /// the caller requests an overlapping search while using an Aho-Corasick /// automaton with a match kind other than [`MatchKind::Standard`]. /// /// The match kind given should be the match kind of the automaton. It /// should never be `MatchKind::Standard`. pub fn unsupported_overlapping(got: MatchKind) -> MatchError { MatchError::new(MatchErrorKind::UnsupportedOverlapping { got }) } /// Create a new "unsupported empty pattern" error. This occurs when the /// caller requests a search for which matching an automaton that contains /// an empty pattern string is not supported. pub fn unsupported_empty() -> MatchError { MatchError::new(MatchErrorKind::UnsupportedEmpty) } } /// The underlying kind of a [`MatchError`]. /// /// This is a **non-exhaustive** enum. That means new variants may be added in /// a semver-compatible release. #[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum MatchErrorKind { /// An error indicating that an anchored search was requested, but from a /// searcher that was built without anchored support. InvalidInputAnchored, /// An error indicating that an unanchored search was requested, but from a /// searcher that was built without unanchored support. InvalidInputUnanchored, /// An error indicating that a stream search was attempted on an /// Aho-Corasick automaton with an unsupported `MatchKind`.
UnsupportedStream { /// The match semantics for the automaton that was used. got: MatchKind, }, /// An error indicating that an overlapping search was attempted on an /// Aho-Corasick automaton with an unsupported `MatchKind`. UnsupportedOverlapping { /// The match semantics for the automaton that was used. got: MatchKind, }, /// An error indicating that the operation requested doesn't support /// automatons that contain an empty pattern string. UnsupportedEmpty, } #[cfg(feature = "std")] impl std::error::Error for MatchError {} impl core::fmt::Display for MatchError { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { match *self.kind() { MatchErrorKind::InvalidInputAnchored => { write!(f, "anchored searches are not supported or enabled") } MatchErrorKind::InvalidInputUnanchored => { write!(f, "unanchored searches are not supported or enabled") } MatchErrorKind::UnsupportedStream { got } => { write!( f, "match kind {:?} does not support stream searching", got, ) } MatchErrorKind::UnsupportedOverlapping { got } => { write!( f, "match kind {:?} does not support overlapping searches", got, ) } MatchErrorKind::UnsupportedEmpty => { write!( f, "matching with an empty pattern string is not \ supported for this operation", ) } } } } aho-corasick-1.1.3/src/util/int.rs000064400000000000000000000141221046102023000150750ustar 00000000000000/*! This module provides several integer oriented traits for converting between both fixed size integers and integers whose size varies based on the target (like `usize`). The main design principle for this module is to centralize all uses of `as`. The thinking here is that `as` makes it very easy to perform accidental lossy conversions, and if we centralize all its uses here under more descriptive higher level operations, its use and correctness becomes easier to audit. This was copied mostly wholesale from `regex-automata`. NOTE: for simplicity, we don't take target pointer width into account here for `usize` conversions. Since we currently only panic in debug mode, skipping the check when it can be proven it isn't needed at compile time doesn't really matter. Now, if we wind up wanting to do as many checks as possible in release mode, then we would want to skip those when we know the conversions are always non-lossy. */ // We define a little more than what we need, but I'd rather just have // everything via a consistent and uniform API then have holes. 
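// Illustrative sketch (not crate code) of the conversion traits defined
// below: call sites use named, audited conversions instead of bare `as`
// casts, and lossy conversions panic in debug builds.
//
// fn int_sketch() {
//     let n: u32 = 0xDEAD_BEEF;
//     // `U32` splits a u32 into halves without any `as` at the call site.
//     assert_eq!(n.high_u16(), 0xDEAD);
//     assert_eq!(n.low_u16(), 0xBEEF);
//     // `Usize` goes the other way; in debug mode this asserts that the
//     // value actually fits in the target type instead of truncating.
//     let len: usize = 5;
//     assert_eq!(len.as_u32(), 5u32);
// }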
#![allow(dead_code)] pub(crate) trait U8 { fn as_usize(self) -> usize; } impl U8 for u8 { fn as_usize(self) -> usize { usize::from(self) } } pub(crate) trait U16 { fn as_usize(self) -> usize; fn low_u8(self) -> u8; fn high_u8(self) -> u8; } impl U16 for u16 { fn as_usize(self) -> usize { usize::from(self) } fn low_u8(self) -> u8 { self as u8 } fn high_u8(self) -> u8 { (self >> 8) as u8 } } pub(crate) trait U32 { fn as_usize(self) -> usize; fn low_u8(self) -> u8; fn low_u16(self) -> u16; fn high_u16(self) -> u16; } impl U32 for u32 { #[inline] fn as_usize(self) -> usize { #[cfg(debug_assertions)] { usize::try_from(self).expect("u32 overflowed usize") } #[cfg(not(debug_assertions))] { self as usize } } fn low_u8(self) -> u8 { self as u8 } fn low_u16(self) -> u16 { self as u16 } fn high_u16(self) -> u16 { (self >> 16) as u16 } } pub(crate) trait U64 { fn as_usize(self) -> usize; fn low_u8(self) -> u8; fn low_u16(self) -> u16; fn low_u32(self) -> u32; fn high_u32(self) -> u32; } impl U64 for u64 { fn as_usize(self) -> usize { #[cfg(debug_assertions)] { usize::try_from(self).expect("u64 overflowed usize") } #[cfg(not(debug_assertions))] { self as usize } } fn low_u8(self) -> u8 { self as u8 } fn low_u16(self) -> u16 { self as u16 } fn low_u32(self) -> u32 { self as u32 } fn high_u32(self) -> u32 { (self >> 32) as u32 } } pub(crate) trait I8 { fn as_usize(self) -> usize; fn to_bits(self) -> u8; fn from_bits(n: u8) -> i8; } impl I8 for i8 { fn as_usize(self) -> usize { #[cfg(debug_assertions)] { usize::try_from(self).expect("i8 overflowed usize") } #[cfg(not(debug_assertions))] { self as usize } } fn to_bits(self) -> u8 { self as u8 } fn from_bits(n: u8) -> i8 { n as i8 } } pub(crate) trait I32 { fn as_usize(self) -> usize; fn to_bits(self) -> u32; fn from_bits(n: u32) -> i32; } impl I32 for i32 { fn as_usize(self) -> usize { #[cfg(debug_assertions)] { usize::try_from(self).expect("i32 overflowed usize") } #[cfg(not(debug_assertions))] { self as usize } } fn to_bits(self) -> u32 { self as u32 } fn from_bits(n: u32) -> i32 { n as i32 } } pub(crate) trait I64 { fn as_usize(self) -> usize; fn to_bits(self) -> u64; fn from_bits(n: u64) -> i64; } impl I64 for i64 { fn as_usize(self) -> usize { #[cfg(debug_assertions)] { usize::try_from(self).expect("i64 overflowed usize") } #[cfg(not(debug_assertions))] { self as usize } } fn to_bits(self) -> u64 { self as u64 } fn from_bits(n: u64) -> i64 { n as i64 } } pub(crate) trait Usize { fn as_u8(self) -> u8; fn as_u16(self) -> u16; fn as_u32(self) -> u32; fn as_u64(self) -> u64; } impl Usize for usize { fn as_u8(self) -> u8 { #[cfg(debug_assertions)] { u8::try_from(self).expect("usize overflowed u8") } #[cfg(not(debug_assertions))] { self as u8 } } fn as_u16(self) -> u16 { #[cfg(debug_assertions)] { u16::try_from(self).expect("usize overflowed u16") } #[cfg(not(debug_assertions))] { self as u16 } } fn as_u32(self) -> u32 { #[cfg(debug_assertions)] { u32::try_from(self).expect("usize overflowed u32") } #[cfg(not(debug_assertions))] { self as u32 } } fn as_u64(self) -> u64 { #[cfg(debug_assertions)] { u64::try_from(self).expect("usize overflowed u64") } #[cfg(not(debug_assertions))] { self as u64 } } } // Pointers aren't integers, but we convert pointers to integers to perform // offset arithmetic in some places. (And no, we don't convert the integers // back to pointers.) So add 'as_usize' conversions here too for completeness. // // These 'as' casts are actually okay because they're always non-lossy. 
But the // idea here is to just try and remove as much 'as' as possible, particularly // in this crate where we are being really paranoid about offsets and making // sure we don't panic on inputs that might be untrusted. This way, the 'as' // casts become easier to audit if they're all in one place, even when some of // them are actually okay 100% of the time. pub(crate) trait Pointer { fn as_usize(self) -> usize; } impl<T> Pointer for *const T { fn as_usize(self) -> usize { self as usize } } aho-corasick-1.1.3/src/util/mod.rs000064400000000000000000000004461046102023000150660ustar 00000000000000pub(crate) mod alphabet; #[cfg(feature = "std")] pub(crate) mod buffer; pub(crate) mod byte_frequencies; pub(crate) mod debug; pub(crate) mod error; pub(crate) mod int; pub(crate) mod prefilter; pub(crate) mod primitives; pub(crate) mod remapper; pub(crate) mod search; pub(crate) mod special; aho-corasick-1.1.3/src/util/prefilter.rs000064400000000000000000001027631046102023000163100ustar 00000000000000use core::{ cmp, fmt::Debug, panic::{RefUnwindSafe, UnwindSafe}, u8, }; use alloc::{sync::Arc, vec, vec::Vec}; use crate::{ packed, util::{ alphabet::ByteSet, search::{Match, MatchKind, Span}, }, }; /// A prefilter for accelerating a search. /// /// This crate uses prefilters in the core search implementations to accelerate /// common cases. They typically only apply to cases where there are a small /// number of patterns (less than 100 or so), but when they do, throughput can /// be boosted considerably, perhaps by an order of magnitude. When a prefilter /// is active, it is used whenever a search enters an automaton's start state. /// /// Currently, prefilters cannot be constructed by /// callers. A `Prefilter` can only be accessed via the /// [`Automaton::prefilter`](crate::automaton::Automaton::prefilter) /// method and used to execute a search. In other words, a prefilter can be /// used to optimize your own search implementation if necessary, but cannot do /// much else. If you have a use case for more APIs, please submit an issue. #[derive(Clone, Debug)] pub struct Prefilter { finder: Arc<dyn PrefilterI>, memory_usage: usize, } impl Prefilter { /// Execute a search in the haystack within the span given. If a match or /// a possible match is returned, then it is guaranteed to occur within /// the bounds of the span. /// /// If the span provided is invalid for the given haystack, then behavior /// is unspecified. #[inline] pub fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { self.finder.find_in(haystack, span) } #[inline] pub(crate) fn memory_usage(&self) -> usize { self.memory_usage } } /// A candidate is the result of running a prefilter on a haystack at a /// particular position. /// /// The result is either no match, a confirmed match or a possible match. /// /// When no match is returned, the prefilter is guaranteeing that no possible /// match can be found in the haystack, and the caller may trust this. That is, /// all correct prefilters must never report false negatives. /// /// In some cases, a prefilter can confirm a match very quickly, in which case, /// the caller may use this to stop what it's doing and report the match. In /// this case, prefilter implementations must never report a false positive. /// In other cases, the prefilter can only report a potential match, in which /// case the callers must attempt to confirm the match. In this case, prefilter /// implementations are permitted to return false positives. #[derive(Clone, Debug)] pub enum Candidate { /// No match was found.
Since false negatives are not possible, this means /// the search can quit as it is guaranteed not to find another match. None, /// A confirmed match was found. Callers do not need to confirm it. Match(Match), /// The start of a possible match was found. Callers must confirm it before /// reporting it as a match. PossibleStartOfMatch(usize), } impl Candidate { /// Convert this candidate into an option. This is useful when callers /// do not distinguish between true positives and false positives (i.e., /// the caller must always confirm the match). pub fn into_option(self) -> Option<usize> { match self { Candidate::None => None, Candidate::Match(ref m) => Some(m.start()), Candidate::PossibleStartOfMatch(start) => Some(start), } } } /// A prefilter describes the behavior of fast literal scanners for quickly /// skipping past bytes in the haystack that we know cannot possibly /// participate in a match. trait PrefilterI: Send + Sync + RefUnwindSafe + UnwindSafe + Debug + 'static { /// Returns the next possible match candidate. This may yield false /// positives, so callers must confirm a match starting at the position /// returned. This, however, must never produce false negatives. That is, /// this must, at minimum, return the starting position of the next match /// in the given haystack after or at the given position. fn find_in(&self, haystack: &[u8], span: Span) -> Candidate; } impl<P: PrefilterI + ?Sized> PrefilterI for Arc<P>
{ #[inline(always)] fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { (**self).find_in(haystack, span) } } /// A builder for constructing the best possible prefilter. When constructed, /// this builder will heuristically select the best prefilter it can build, /// if any, and discard the rest. #[derive(Debug)] pub(crate) struct Builder { count: usize, ascii_case_insensitive: bool, start_bytes: StartBytesBuilder, rare_bytes: RareBytesBuilder, memmem: MemmemBuilder, packed: Option, // If we run across a condition that suggests we shouldn't use a prefilter // at all (like an empty pattern), then disable prefilters entirely. enabled: bool, } impl Builder { /// Create a new builder for constructing the best possible prefilter. pub(crate) fn new(kind: MatchKind) -> Builder { let pbuilder = kind .as_packed() .map(|kind| packed::Config::new().match_kind(kind).builder()); Builder { count: 0, ascii_case_insensitive: false, start_bytes: StartBytesBuilder::new(), rare_bytes: RareBytesBuilder::new(), memmem: MemmemBuilder::default(), packed: pbuilder, enabled: true, } } /// Enable ASCII case insensitivity. When set, byte strings added to this /// builder will be interpreted without respect to ASCII case. pub(crate) fn ascii_case_insensitive(mut self, yes: bool) -> Builder { self.ascii_case_insensitive = yes; self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); self } /// Return a prefilter suitable for quickly finding potential matches. /// /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. pub(crate) fn build(&self) -> Option { if !self.enabled { debug!("prefilter not enabled, skipping"); return None; } // If we only have one pattern, then deferring to memmem is always // the best choice. This is kind of a weird case, because, well, why // use Aho-Corasick if you only have one pattern? But maybe you don't // know exactly how many patterns you'll get up front, and you need to // support the option of multiple patterns. So instead of relying on // the caller to branch and use memmem explicitly, we just do it for // them. if !self.ascii_case_insensitive { if let Some(pre) = self.memmem.build() { debug!("using memmem prefilter"); return Some(pre); } } let (packed, patlen, minlen) = if self.ascii_case_insensitive { (None, usize::MAX, 0) } else { let patlen = self.packed.as_ref().map_or(usize::MAX, |p| p.len()); let minlen = self.packed.as_ref().map_or(0, |p| p.minimum_len()); let packed = self.packed.as_ref().and_then(|b| b.build()).map(|s| { let memory_usage = s.memory_usage(); debug!( "built packed prefilter (len: {}, \ minimum pattern len: {}, memory usage: {}) \ for consideration", patlen, minlen, memory_usage, ); Prefilter { finder: Arc::new(Packed(s)), memory_usage } }); (packed, patlen, minlen) }; match (self.start_bytes.build(), self.rare_bytes.build()) { // If we could build both start and rare prefilters, then there are // a few cases in which we'd want to use the start-byte prefilter // over the rare-byte prefilter, since the former has lower // overhead. 
(prestart @ Some(_), prerare @ Some(_)) => { debug!( "both start (len={}, rank={}) and \ rare (len={}, rank={}) byte prefilters \ are available", self.start_bytes.count, self.start_bytes.rank_sum, self.rare_bytes.count, self.rare_bytes.rank_sum, ); if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 && self.rare_bytes.count >= 3 { debug!( "start and rare byte prefilters available, but \ they're probably slower than packed so using \ packed" ); return packed; } // If the start-byte prefilter can scan for a smaller number // of bytes than the rare-byte prefilter, then it's probably // faster. let has_fewer_bytes = self.start_bytes.count < self.rare_bytes.count; // Otherwise, if the combined frequency rank of the detected // bytes in the start-byte prefilter is "close" to the combined // frequency rank of the rare-byte prefilter, then we pick // the start-byte prefilter even if the rare-byte prefilter // heuristically searches for rare bytes. This is because the // rare-byte prefilter has higher constant costs, so we tend to // prefer the start-byte prefilter when we can. let has_rarer_bytes = self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; if has_fewer_bytes { debug!( "using start byte prefilter because it has fewer bytes to search for than the rare byte prefilter", ); prestart } else if has_rarer_bytes { debug!( "using start byte prefilter because its byte \ frequency rank was determined to be \ \"good enough\" relative to the rare byte prefilter \ byte frequency rank", ); prestart } else { debug!("using rare byte prefilter"); prerare } } (prestart @ Some(_), None) => { if patlen <= 16 && minlen >= 2 && self.start_bytes.count >= 3 { debug!( "start byte prefilter available, but \ it's probably slower than packed so using \ packed" ); return packed; } debug!( "have start byte prefilter but not rare byte prefilter, \ so using start byte prefilter", ); prestart } (None, prerare @ Some(_)) => { if patlen <= 16 && minlen >= 2 && self.rare_bytes.count >= 3 { debug!( "rare byte prefilter available, but \ it's probably slower than packed so using \ packed" ); return packed; } debug!( "have rare byte prefilter but not start byte prefilter, \ so using rare byte prefilter", ); prerare } (None, None) if self.ascii_case_insensitive => { debug!( "no start or rare byte prefilter and ASCII case \ insensitivity was enabled, so skipping prefilter", ); None } (None, None) => { if packed.is_some() { debug!("falling back to packed prefilter"); } else { debug!("no prefilter available"); } packed } } } /// Add a literal string to this prefilter builder. pub(crate) fn add(&mut self, bytes: &[u8]) { if bytes.is_empty() { self.enabled = false; } if !self.enabled { return; } self.count += 1; self.start_bytes.add(bytes); self.rare_bytes.add(bytes); self.memmem.add(bytes); if let Some(ref mut pbuilder) = self.packed { pbuilder.add(bytes); } } } /// A type that wraps a packed searcher and implements the `Prefilter` /// interface. #[derive(Clone, Debug)] struct Packed(packed::Searcher); impl PrefilterI for Packed { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { self.0 .find_in(&haystack, span) .map_or(Candidate::None, Candidate::Match) } } /// A builder for constructing a prefilter that uses memmem. #[derive(Debug, Default)] struct MemmemBuilder { /// The number of patterns that have been added. count: usize, /// The singular pattern to search for. This is only set when count==1. 
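// Illustrative sketch (not crate code): how a search loop consumes
// `Candidate` values from `Prefilter::find_in`, per the contract documented
// above. `confirm` stands in for the automaton's own verification step, and
// `Span` is assumed to expose public `start`/`end` fields, as it does in this
// crate's `util::search` module.
//
// fn scan(
//     pre: &Prefilter,
//     haystack: &[u8],
//     confirm: impl Fn(usize) -> Option<Match>,
// ) -> Option<Match> {
//     let mut span = Span { start: 0, end: haystack.len() };
//     while span.start < span.end {
//         match pre.find_in(haystack, span) {
//             // False negatives are impossible, so the search is over.
//             Candidate::None => return None,
//             // Already confirmed by the prefilter; report it directly.
//             Candidate::Match(m) => return Some(m),
//             // Possibly a false positive; the caller must verify it.
//             Candidate::PossibleStartOfMatch(start) => match confirm(start) {
//                 Some(m) => return Some(m),
//                 // Resume scanning just past the failed candidate.
//                 None => span.start = start + 1,
//             },
//         }
//     }
//     None
// }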
one: Option<Vec<u8>>, } impl MemmemBuilder { fn build(&self) -> Option<Prefilter> { #[cfg(all(feature = "std", feature = "perf-literal"))] fn imp(builder: &MemmemBuilder) -> Option<Prefilter> { let pattern = builder.one.as_ref()?; assert_eq!(1, builder.count); let finder = Arc::new(Memmem( memchr::memmem::Finder::new(pattern).into_owned(), )); let memory_usage = pattern.len(); Some(Prefilter { finder, memory_usage }) } #[cfg(not(all(feature = "std", feature = "perf-literal")))] fn imp(_: &MemmemBuilder) -> Option<Prefilter> { None } imp(self) } fn add(&mut self, bytes: &[u8]) { self.count += 1; if self.count == 1 { self.one = Some(bytes.to_vec()); } else { self.one = None; } } } /// A type that wraps a SIMD accelerated single substring search from the /// `memchr` crate for use as a prefilter. /// /// Currently, this prefilter is only active for Aho-Corasick searchers with /// a single pattern. In theory, this could be extended to support searchers /// that have a common prefix of more than one byte (for one byte, we would use /// memchr), but it's not clear if it's worth it or not. /// /// Also, unfortunately, this currently also requires the 'std' feature to /// be enabled. That's because memchr doesn't have a no-std-but-with-alloc /// mode, and so APIs like Finder::into_owned aren't available when 'std' is /// disabled. But there should be an 'alloc' feature that brings in APIs like /// Finder::into_owned but doesn't use std-only features like runtime CPU /// feature detection. #[cfg(all(feature = "std", feature = "perf-literal"))] #[derive(Clone, Debug)] struct Memmem(memchr::memmem::Finder<'static>); #[cfg(all(feature = "std", feature = "perf-literal"))] impl PrefilterI for Memmem { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { use crate::util::primitives::PatternID; self.0.find(&haystack[span]).map_or(Candidate::None, |i| { let start = span.start + i; let end = start + self.0.needle().len(); // N.B. We can declare a match and use a fixed pattern ID here // because a Memmem prefilter is only ever created for searchers // with exactly one pattern. Thus, every match is always a match // and it is always for the first and only pattern. Candidate::Match(Match::new(PatternID::ZERO, start..end)) }) } } /// A builder for constructing a rare byte prefilter. /// /// A rare byte prefilter attempts to pick out a small set of rare bytes that /// occur in the patterns, and then quickly scan to matches of those rare /// bytes. #[derive(Clone, Debug)] struct RareBytesBuilder { /// Whether this prefilter should account for ASCII case insensitivity or /// not. ascii_case_insensitive: bool, /// A set of rare bytes, indexed by byte value. rare_set: ByteSet, /// A set of byte offsets associated with bytes in a pattern. An entry /// corresponds to a particular byte (its index) and is only non-zero if /// the byte occurred at an offset greater than 0 in at least one pattern. /// /// If a byte's offset is not representable in 8 bits, then the rare bytes /// prefilter becomes inert. byte_offsets: RareByteOffsets, /// Whether this is available as a prefilter or not. This can be set to /// false during construction if a condition is seen that invalidates the /// use of the rare-byte prefilter. available: bool, /// The number of bytes set to an active value in `byte_offsets`. count: usize, /// The sum of frequency ranks for the rare bytes detected. This is /// intended to give a heuristic notion of how rare the bytes are. rank_sum: u16, } /// A set of byte offsets, keyed by byte.
#[derive(Clone, Copy)] struct RareByteOffsets { /// Each entry corresponds to the maximum offset of the corresponding /// byte across all patterns seen. set: [RareByteOffset; 256], } impl RareByteOffsets { /// Create a new empty set of rare byte offsets. pub(crate) fn empty() -> RareByteOffsets { RareByteOffsets { set: [RareByteOffset::default(); 256] } } /// Add the given offset for the given byte to this set. If the offset is /// greater than the existing offset, then it overwrites the previous /// value and returns false. If there is no previous value set, then this /// sets it and returns true. pub(crate) fn set(&mut self, byte: u8, off: RareByteOffset) { self.set[byte as usize].max = cmp::max(self.set[byte as usize].max, off.max); } } impl core::fmt::Debug for RareByteOffsets { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let mut offsets = vec![]; for off in self.set.iter() { if off.max > 0 { offsets.push(off); } } f.debug_struct("RareByteOffsets").field("set", &offsets).finish() } } /// Offsets associated with an occurrence of a "rare" byte in any of the /// patterns used to construct a single Aho-Corasick automaton. #[derive(Clone, Copy, Debug)] struct RareByteOffset { /// The maximum offset at which a particular byte occurs from the start /// of any pattern. This is used as a shift amount. That is, when an /// occurrence of this byte is found, the candidate position reported by /// the prefilter is `position_of_byte - max`, such that the automaton /// will begin its search at a position that is guaranteed to observe a /// match. /// /// To avoid accidentally quadratic behavior, a prefilter is considered /// ineffective when it is asked to start scanning from a position that it /// has already scanned past. /// /// Using a `u8` here means that if we ever see a pattern that's longer /// than 255 bytes, then the entire rare byte prefilter is disabled. max: u8, } impl Default for RareByteOffset { fn default() -> RareByteOffset { RareByteOffset { max: 0 } } } impl RareByteOffset { /// Create a new rare byte offset. If the given offset is too big, then /// None is returned. In that case, callers should render the rare bytes /// prefilter inert. fn new(max: usize) -> Option { if max > u8::MAX as usize { None } else { Some(RareByteOffset { max: max as u8 }) } } } impl RareBytesBuilder { /// Create a new builder for constructing a rare byte prefilter. fn new() -> RareBytesBuilder { RareBytesBuilder { ascii_case_insensitive: false, rare_set: ByteSet::empty(), byte_offsets: RareByteOffsets::empty(), available: true, count: 0, rank_sum: 0, } } /// Enable ASCII case insensitivity. When set, byte strings added to this /// builder will be interpreted without respect to ASCII case. fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder { self.ascii_case_insensitive = yes; self } /// Build the rare bytes prefilter. /// /// If there are more than 3 distinct rare bytes found, or if heuristics /// otherwise determine that this prefilter should not be used, then `None` /// is returned. 
fn build(&self) -> Option { #[cfg(feature = "perf-literal")] fn imp(builder: &RareBytesBuilder) -> Option { if !builder.available || builder.count > 3 { return None; } let (mut bytes, mut len) = ([0; 3], 0); for b in 0..=255 { if builder.rare_set.contains(b) { bytes[len] = b as u8; len += 1; } } let finder: Arc = match len { 0 => return None, 1 => Arc::new(RareBytesOne { byte1: bytes[0], offset: builder.byte_offsets.set[bytes[0] as usize], }), 2 => Arc::new(RareBytesTwo { offsets: builder.byte_offsets, byte1: bytes[0], byte2: bytes[1], }), 3 => Arc::new(RareBytesThree { offsets: builder.byte_offsets, byte1: bytes[0], byte2: bytes[1], byte3: bytes[2], }), _ => unreachable!(), }; Some(Prefilter { finder, memory_usage: 0 }) } #[cfg(not(feature = "perf-literal"))] fn imp(_: &RareBytesBuilder) -> Option { None } imp(self) } /// Add a byte string to this builder. /// /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. fn add(&mut self, bytes: &[u8]) { // If we've already given up, then do nothing. if !self.available { return; } // If we've already blown our budget, then don't waste time looking // for more rare bytes. if self.count > 3 { self.available = false; return; } // If the pattern is too long, then our offset table is bunk, so // give up. if bytes.len() >= 256 { self.available = false; return; } let mut rarest = match bytes.get(0) { None => return, Some(&b) => (b, freq_rank(b)), }; // The idea here is to look for the rarest byte in each pattern, and // add that to our set. As a special exception, if we see a byte that // we've already added, then we immediately stop and choose that byte, // even if there's another rare byte in the pattern. This helps us // apply the rare byte optimization in more cases by attempting to pick // bytes that are in common between patterns. So for example, if we // were searching for `Sherlock` and `lockjaw`, then this would pick // `k` for both patterns, resulting in the use of `memchr` instead of // `memchr2` for `k` and `j`. let mut found = false; for (pos, &b) in bytes.iter().enumerate() { self.set_offset(pos, b); if found { continue; } if self.rare_set.contains(b) { found = true; continue; } let rank = freq_rank(b); if rank < rarest.1 { rarest = (b, rank); } } if !found { self.add_rare_byte(rarest.0); } } fn set_offset(&mut self, pos: usize, byte: u8) { // This unwrap is OK because pos is never bigger than our max. let offset = RareByteOffset::new(pos).unwrap(); self.byte_offsets.set(byte, offset); if self.ascii_case_insensitive { self.byte_offsets.set(opposite_ascii_case(byte), offset); } } fn add_rare_byte(&mut self, byte: u8) { self.add_one_rare_byte(byte); if self.ascii_case_insensitive { self.add_one_rare_byte(opposite_ascii_case(byte)); } } fn add_one_rare_byte(&mut self, byte: u8) { if !self.rare_set.contains(byte) { self.rare_set.add(byte); self.count += 1; self.rank_sum += freq_rank(byte) as u16; } } } /// A prefilter for scanning for a single "rare" byte. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct RareBytesOne { byte1: u8, offset: RareByteOffset, } #[cfg(feature = "perf-literal")] impl PrefilterI for RareBytesOne { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr(self.byte1, &haystack[span]) .map(|i| { let pos = span.start + i; cmp::max( span.start, pos.saturating_sub(usize::from(self.offset.max)), ) }) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A prefilter for scanning for two "rare" bytes. 
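// Illustrative sketch (not crate code) of the candidate arithmetic used by
// `RareBytesOne` above: when the rare byte is found at `pos`, the reported
// candidate is backed up by that byte's maximum in-pattern offset, so the
// automaton starts at a position that can still observe the whole match.
// The pattern and offsets here are hypothetical.
//
// fn rare_shift_sketch() {
//     let haystack = b"xxxx lockjaw xxxx";
//     // Suppose 'k' was chosen as the rare byte for the pattern "lockjaw";
//     // its maximum offset from the start of any pattern is 3 ("loc" 'k').
//     let max_offset = 3;
//     let pos = memchr::memchr(b'k', haystack).unwrap();
//     // 'k' is at index 8, so the candidate start is 8 - 3 = 5, which is
//     // exactly where "lockjaw" begins.
//     assert_eq!(pos.saturating_sub(max_offset), 5);
// }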
#[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct RareBytesTwo { offsets: RareByteOffsets, byte1: u8, byte2: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for RareBytesTwo { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr2(self.byte1, self.byte2, &haystack[span]) .map(|i| { let pos = span.start + i; let offset = self.offsets.set[usize::from(haystack[pos])].max; cmp::max(span.start, pos.saturating_sub(usize::from(offset))) }) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A prefilter for scanning for three "rare" bytes. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct RareBytesThree { offsets: RareByteOffsets, byte1: u8, byte2: u8, byte3: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for RareBytesThree { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) .map(|i| { let pos = span.start + i; let offset = self.offsets.set[usize::from(haystack[pos])].max; cmp::max(span.start, pos.saturating_sub(usize::from(offset))) }) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A builder for constructing a starting byte prefilter. /// /// A starting byte prefilter is a simplistic prefilter that looks for possible /// matches by reporting all positions corresponding to a particular byte. This /// generally only takes effect when there are at most 3 distinct possible /// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two /// distinct starting bytes (`f` and `b`), and this prefilter returns all /// occurrences of either `f` or `b`. /// /// In some cases, a heuristic frequency analysis may determine that it would /// be better not to use this prefilter even when there are 3 or fewer distinct /// starting bytes. #[derive(Clone, Debug)] struct StartBytesBuilder { /// Whether this prefilter should account for ASCII case insensitivity or /// not. ascii_case_insensitive: bool, /// The set of starting bytes observed. byteset: Vec<bool>, /// The number of bytes set to true in `byteset`. count: usize, /// The sum of frequency ranks for the rare bytes detected. This is /// intended to give a heuristic notion of how rare the bytes are. rank_sum: u16, } impl StartBytesBuilder { /// Create a new builder for constructing a start byte prefilter. fn new() -> StartBytesBuilder { StartBytesBuilder { ascii_case_insensitive: false, byteset: vec![false; 256], count: 0, rank_sum: 0, } } /// Enable ASCII case insensitivity. When set, byte strings added to this /// builder will be interpreted without respect to ASCII case. fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { self.ascii_case_insensitive = yes; self } /// Build the starting bytes prefilter. /// /// If there are more than 3 distinct starting bytes, or if heuristics /// otherwise determine that this prefilter should not be used, then `None` /// is returned. fn build(&self) -> Option<Prefilter> { #[cfg(feature = "perf-literal")] fn imp(builder: &StartBytesBuilder) -> Option<Prefilter> { if builder.count > 3 { return None; } let (mut bytes, mut len) = ([0; 3], 0); for b in 0..256 { if !builder.byteset[b] { continue; } // We don't handle non-ASCII bytes for now. Getting non-ASCII // bytes right is trickier, since we generally don't want to put // a leading UTF-8 code unit into a prefilter that isn't ASCII, // since they can occur frequently.
Instead, it would be better to use a // continuation byte, but this requires more sophisticated analysis // of the automaton and a richer prefilter API. if b > 0x7F { return None; } bytes[len] = b as u8; len += 1; } let finder: Arc = match len { 0 => return None, 1 => Arc::new(StartBytesOne { byte1: bytes[0] }), 2 => Arc::new(StartBytesTwo { byte1: bytes[0], byte2: bytes[1], }), 3 => Arc::new(StartBytesThree { byte1: bytes[0], byte2: bytes[1], byte3: bytes[2], }), _ => unreachable!(), }; Some(Prefilter { finder, memory_usage: 0 }) } #[cfg(not(feature = "perf-literal"))] fn imp(_: &StartBytesBuilder) -> Option { None } imp(self) } /// Add a byte string to this builder. /// /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. fn add(&mut self, bytes: &[u8]) { if self.count > 3 { return; } if let Some(&byte) = bytes.get(0) { self.add_one_byte(byte); if self.ascii_case_insensitive { self.add_one_byte(opposite_ascii_case(byte)); } } } fn add_one_byte(&mut self, byte: u8) { if !self.byteset[byte as usize] { self.byteset[byte as usize] = true; self.count += 1; self.rank_sum += freq_rank(byte) as u16; } } } /// A prefilter for scanning for a single starting byte. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct StartBytesOne { byte1: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for StartBytesOne { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr(self.byte1, &haystack[span]) .map(|i| span.start + i) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A prefilter for scanning for two starting bytes. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct StartBytesTwo { byte1: u8, byte2: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for StartBytesTwo { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr2(self.byte1, self.byte2, &haystack[span]) .map(|i| span.start + i) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// A prefilter for scanning for three starting bytes. #[cfg(feature = "perf-literal")] #[derive(Clone, Debug)] struct StartBytesThree { byte1: u8, byte2: u8, byte3: u8, } #[cfg(feature = "perf-literal")] impl PrefilterI for StartBytesThree { fn find_in(&self, haystack: &[u8], span: Span) -> Candidate { memchr::memchr3(self.byte1, self.byte2, self.byte3, &haystack[span]) .map(|i| span.start + i) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } } /// If the given byte is an ASCII letter, then return it in the opposite case. /// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns /// `b'A'`. If a non-ASCII letter is given, then the given byte is returned. pub(crate) fn opposite_ascii_case(b: u8) -> u8 { if b'A' <= b && b <= b'Z' { b.to_ascii_lowercase() } else if b'a' <= b && b <= b'z' { b.to_ascii_uppercase() } else { b } } /// Return the frequency rank of the given byte. The higher the rank, the more /// common the byte (heuristically speaking). fn freq_rank(b: u8) -> u8 { use crate::util::byte_frequencies::BYTE_FREQUENCIES; BYTE_FREQUENCIES[b as usize] } aho-corasick-1.1.3/src/util/primitives.rs000064400000000000000000000651141046102023000165050ustar 00000000000000/*! Lower level primitive types that are useful in a variety of circumstances. # Overview This list represents the principle types in this module and briefly describes when you might want to use them. * [`PatternID`] - A type that represents the identifier of a regex pattern. 
This is probably the most widely used type in this module (which is why it's also re-exported in the crate root). * [`StateID`] - A type the represents the identifier of a finite automaton state. This is used for both NFAs and DFAs, with the notable exception of the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state identifier.) * [`SmallIndex`] - The internal representation of both a `PatternID` and a `StateID`. Its purpose is to serve as a type that can index memory without being as big as a `usize` on 64-bit targets. The main idea behind this type is that there are many things in regex engines that will, in practice, never overflow a 32-bit integer. (For example, like the number of patterns in a regex or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index memory without peppering `as` casts everywhere. Moreover, it forces callers to handle errors in the case where, somehow, the value would otherwise overflow either a 32-bit integer or a `usize` (e.g., on 16-bit targets). */ // The macro we use to define some types below adds methods that we don't // use on some of the types. There isn't much, so we just squash the warning. #![allow(dead_code)] use alloc::vec::Vec; use crate::util::int::{Usize, U16, U32, U64}; /// A type that represents a "small" index. /// /// The main idea of this type is to provide something that can index memory, /// but uses less memory than `usize` on 64-bit systems. Specifically, its /// representation is always a `u32` and has `repr(transparent)` enabled. (So /// it is safe to transmute between a `u32` and a `SmallIndex`.) /// /// A small index is typically useful in cases where there is no practical way /// that the index will overflow a 32-bit integer. A good example of this is /// an NFA state. If you could somehow build an NFA with `2^30` states, its /// memory usage would be exorbitant and its runtime execution would be so /// slow as to be completely worthless. Therefore, this crate generally deems /// it acceptable to return an error if it would otherwise build an NFA that /// requires a slice longer than what a 32-bit integer can index. In exchange, /// we can use 32-bit indices instead of 64-bit indices in various places. /// /// This type ensures this by providing a constructor that will return an error /// if its argument cannot fit into the type. This makes it much easier to /// handle these sorts of boundary cases that are otherwise extremely subtle. /// /// On all targets, this type guarantees that its value will fit in a `u32`, /// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for /// example, this type's maximum value will never overflow an `isize`, /// which means it will never overflow a `i16` even though its internal /// representation is still a `u32`. /// /// The purpose for making the type fit into even signed integer types like /// `isize` is to guarantee that the difference between any two small indices /// is itself also a small index. This is useful in certain contexts, e.g., /// for delta encoding. /// /// # Other types /// /// The following types wrap `SmallIndex` to provide a more focused use case: /// /// * [`PatternID`] is for representing the identifiers of patterns. /// * [`StateID`] is for representing the identifiers of states in finite /// automata. It is used for both NFAs and DFAs. /// /// # Representation /// /// This type is always represented internally by a `u32` and is marked as /// `repr(transparent)`. 
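// Illustrative sketch (not crate code) of the fallible construction and
// slice indexing described in this documentation. `SmallIndex` is
// `pub(crate)`, so this assumes crate-internal access.
//
// fn small_index_sketch() {
//     let idx = SmallIndex::new(2).unwrap();
//     let xs: &[u32] = &[10, 20, 30, 40];
//     // Slices can be indexed directly by a `SmallIndex`.
//     assert_eq!(xs[idx], 30);
//     // Values beyond `SmallIndex::MAX` are rejected, not truncated.
//     assert!(SmallIndex::new(usize::MAX).is_err());
// }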
Thus, this type always has the same representation as /// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`. /// /// # Indexing /// /// For convenience, callers may use a `SmallIndex` to index slices. /// /// # Safety /// /// While a `SmallIndex` is meant to guarantee that its value fits into `usize` /// without using as much space as a `usize` on all targets, callers must /// not rely on this property for safety. Callers may choose to rely on this /// property for correctness however. For example, creating a `SmallIndex` with /// an invalid value can be done in entirely safe code. This may in turn result /// in panics or silent logical errors. #[derive( Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, )] #[repr(transparent)] pub(crate) struct SmallIndex(u32); impl SmallIndex { /// The maximum index value. #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] pub const MAX: SmallIndex = // FIXME: Use as_usize() once const functions in traits are stable. SmallIndex::new_unchecked(core::i32::MAX as usize - 1); /// The maximum index value. #[cfg(target_pointer_width = "16")] pub const MAX: SmallIndex = SmallIndex::new_unchecked(core::isize::MAX as usize - 1); /// The total number of values that can be represented as a small index. pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1; /// The zero index value. pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0); /// The number of bytes that a single small index uses in memory. pub const SIZE: usize = core::mem::size_of::<SmallIndex>(); /// Create a new small index. /// /// If the given index exceeds [`SmallIndex::MAX`], then this returns /// an error. #[inline] pub fn new(index: usize) -> Result<SmallIndex, SmallIndexError> { SmallIndex::try_from(index) } /// Create a new small index without checking whether the given value /// exceeds [`SmallIndex::MAX`]. /// /// Using this routine with an invalid index value will result in /// unspecified behavior, but *not* undefined behavior. In particular, an /// invalid index value is likely to cause panics or possibly even silent /// logical errors. /// /// Callers must never rely on a `SmallIndex` to be within a certain range /// for memory safety. #[inline] pub const fn new_unchecked(index: usize) -> SmallIndex { // FIXME: Use as_u32() once const functions in traits are stable. SmallIndex::from_u32_unchecked(index as u32) } /// Create a new small index from a `u32` without checking whether the /// given value exceeds [`SmallIndex::MAX`]. /// /// Using this routine with an invalid index value will result in /// unspecified behavior, but *not* undefined behavior. In particular, an /// invalid index value is likely to cause panics or possibly even silent /// logical errors. /// /// Callers must never rely on a `SmallIndex` to be within a certain range /// for memory safety. #[inline] pub const fn from_u32_unchecked(index: u32) -> SmallIndex { SmallIndex(index) } /// Like [`SmallIndex::new`], but panics if the given index is not valid. #[inline] pub fn must(index: usize) -> SmallIndex { SmallIndex::new(index).expect("invalid small index") } /// Return this small index as a `usize`. This is guaranteed to never /// overflow `usize`. #[inline] pub const fn as_usize(&self) -> usize { // FIXME: Use as_usize() once const functions in traits are stable. self.0 as usize } /// Return this small index as a `u64`. This is guaranteed to never /// overflow. #[inline] pub const fn as_u64(&self) -> u64 { // FIXME: Use u64::from() once const functions in traits are stable.
self.0 as u64 } /// Return the internal `u32` of this small index. This is guaranteed to /// never overflow `u32`. #[inline] pub const fn as_u32(&self) -> u32 { self.0 } /// Return the internal `u32` of this small index represented as an `i32`. /// This is guaranteed to never overflow an `i32`. #[inline] pub const fn as_i32(&self) -> i32 { // This is OK because we guarantee that our max value is <= i32::MAX. self.0 as i32 } /// Returns one more than this small index as a usize. /// /// Since a small index has constraints on its maximum value, adding `1` to /// it will always fit in a `usize`, `isize`, `u32` and a `i32`. #[inline] pub fn one_more(&self) -> usize { self.as_usize() + 1 } /// Decode this small index from the bytes given using the native endian /// byte order for the current target. /// /// If the decoded integer is not representable as a small index for the /// current target, then this returns an error. #[inline] pub fn from_ne_bytes( bytes: [u8; 4], ) -> Result<SmallIndex, SmallIndexError> { let id = u32::from_ne_bytes(bytes); if id > SmallIndex::MAX.as_u32() { return Err(SmallIndexError { attempted: u64::from(id) }); } Ok(SmallIndex::new_unchecked(id.as_usize())) } /// Decode this small index from the bytes given using the native endian /// byte order for the current target. /// /// This is analogous to [`SmallIndex::new_unchecked`] in that it does not /// check whether the decoded integer is representable as a small index. #[inline] pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex { SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize()) } /// Return the underlying small index integer as raw bytes in native endian /// format. #[inline] pub fn to_ne_bytes(&self) -> [u8; 4] { self.0.to_ne_bytes() } } impl<T> core::ops::Index<SmallIndex> for [T] { type Output = T; #[inline] fn index(&self, index: SmallIndex) -> &T { &self[index.as_usize()] } } impl<T> core::ops::IndexMut<SmallIndex> for [T] { #[inline] fn index_mut(&mut self, index: SmallIndex) -> &mut T { &mut self[index.as_usize()] } } impl<T> core::ops::Index<SmallIndex> for Vec<T> { type Output = T; #[inline] fn index(&self, index: SmallIndex) -> &T { &self[index.as_usize()] } } impl<T> core::ops::IndexMut<SmallIndex> for Vec<T> { #[inline] fn index_mut(&mut self, index: SmallIndex) -> &mut T { &mut self[index.as_usize()] } } impl From<StateID> for SmallIndex { fn from(sid: StateID) -> SmallIndex { sid.0 } } impl From<PatternID> for SmallIndex { fn from(pid: PatternID) -> SmallIndex { pid.0 } } impl From<u8> for SmallIndex { fn from(index: u8) -> SmallIndex { SmallIndex::new_unchecked(usize::from(index)) } } impl TryFrom<u16> for SmallIndex { type Error = SmallIndexError; fn try_from(index: u16) -> Result<SmallIndex, SmallIndexError> { if u32::from(index) > SmallIndex::MAX.as_u32() { return Err(SmallIndexError { attempted: u64::from(index) }); } Ok(SmallIndex::new_unchecked(index.as_usize())) } } impl TryFrom<u32> for SmallIndex { type Error = SmallIndexError; fn try_from(index: u32) -> Result<SmallIndex, SmallIndexError> { if index > SmallIndex::MAX.as_u32() { return Err(SmallIndexError { attempted: u64::from(index) }); } Ok(SmallIndex::new_unchecked(index.as_usize())) } } impl TryFrom<u64> for SmallIndex { type Error = SmallIndexError; fn try_from(index: u64) -> Result<SmallIndex, SmallIndexError> { if index > SmallIndex::MAX.as_u64() { return Err(SmallIndexError { attempted: index }); } Ok(SmallIndex::new_unchecked(index.as_usize())) } } impl TryFrom<usize> for SmallIndex { type Error = SmallIndexError; fn try_from(index: usize) -> Result<SmallIndex, SmallIndexError> { if index > SmallIndex::MAX.as_usize() { return Err(SmallIndexError { attempted: index.as_u64() }); } Ok(SmallIndex::new_unchecked(index)) } } /// This error occurs when a small index
could not be constructed. /// /// This occurs when given an integer exceeding the maximum small index value. /// /// When the `std` feature is enabled, this implements the `Error` trait. #[derive(Clone, Debug, Eq, PartialEq)] pub struct SmallIndexError { attempted: u64, } impl SmallIndexError { /// Returns the value that could not be converted to a small index. pub fn attempted(&self) -> u64 { self.attempted } } #[cfg(feature = "std")] impl std::error::Error for SmallIndexError {} impl core::fmt::Display for SmallIndexError { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!( f, "failed to create small index from {:?}, which exceeds {:?}", self.attempted(), SmallIndex::MAX, ) } } #[derive(Clone, Debug)] pub(crate) struct SmallIndexIter { rng: core::ops::Range<usize>, } impl Iterator for SmallIndexIter { type Item = SmallIndex; fn next(&mut self) -> Option<SmallIndex> { if self.rng.start >= self.rng.end { return None; } let next_id = self.rng.start + 1; let id = core::mem::replace(&mut self.rng.start, next_id); // new_unchecked is OK since we asserted that the number of // elements in this iterator will fit in an ID at construction. Some(SmallIndex::new_unchecked(id)) } } macro_rules! index_type_impls { ($name:ident, $err:ident, $iter:ident, $withiter:ident) => { impl $name { /// The maximum value. pub const MAX: $name = $name(SmallIndex::MAX); /// The total number of values that can be represented. pub const LIMIT: usize = SmallIndex::LIMIT; /// The zero value. pub const ZERO: $name = $name(SmallIndex::ZERO); /// The number of bytes that a single value uses in memory. pub const SIZE: usize = SmallIndex::SIZE; /// Create a new value that is represented by a "small index." /// /// If the given index exceeds the maximum allowed value, then this /// returns an error. #[inline] pub fn new(value: usize) -> Result<$name, $err> { SmallIndex::new(value).map($name).map_err($err) } /// Create a new value without checking whether the given argument /// exceeds the maximum. /// /// Using this routine with an invalid value will result in /// unspecified behavior, but *not* undefined behavior. In /// particular, an invalid ID value is likely to cause panics or /// possibly even silent logical errors. /// /// Callers must never rely on this type to be within a certain /// range for memory safety. #[inline] pub const fn new_unchecked(value: usize) -> $name { $name(SmallIndex::new_unchecked(value)) } /// Create a new value from a `u32` without checking whether the /// given value exceeds the maximum. /// /// Using this routine with an invalid value will result in /// unspecified behavior, but *not* undefined behavior. In /// particular, an invalid ID value is likely to cause panics or /// possibly even silent logical errors. /// /// Callers must never rely on this type to be within a certain /// range for memory safety. #[inline] pub const fn from_u32_unchecked(index: u32) -> $name { $name(SmallIndex::from_u32_unchecked(index)) } /// Like `new`, but panics if the given value is not valid. #[inline] pub fn must(value: usize) -> $name { $name::new(value).expect(concat!( "invalid ", stringify!($name), " value" )) } /// Return the internal value as a `usize`. This is guaranteed to /// never overflow `usize`. #[inline] pub const fn as_usize(&self) -> usize { self.0.as_usize() } /// Return the internal value as a `u64`. This is guaranteed to /// never overflow. #[inline] pub const fn as_u64(&self) -> u64 { self.0.as_u64() } /// Return the internal value as a `u32`.
This is guaranteed to /// never overflow `u32`. #[inline] pub const fn as_u32(&self) -> u32 { self.0.as_u32() } /// Return the internal value as an `i32`. This is guaranteed to /// never overflow an `i32`. #[inline] pub const fn as_i32(&self) -> i32 { self.0.as_i32() } /// Returns one more than this value as a usize. /// /// Since values represented by a "small index" have constraints /// on their maximum value, adding `1` to it will always fit in a /// `usize`, `u32` and an `i32`. #[inline] pub fn one_more(&self) -> usize { self.0.one_more() } /// Decode this value from the bytes given using the native endian /// byte order for the current target. /// /// If the decoded integer is not representable as a small index /// for the current target, then this returns an error. #[inline] pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> { SmallIndex::from_ne_bytes(bytes).map($name).map_err($err) } /// Decode this value from the bytes given using the native endian /// byte order for the current target. /// /// This is analogous to `new_unchecked` in that it does not check /// whether the decoded integer is representable as a small index. #[inline] pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name { $name(SmallIndex::from_ne_bytes_unchecked(bytes)) } /// Return the underlying integer as raw bytes in native endian /// format. #[inline] pub fn to_ne_bytes(&self) -> [u8; 4] { self.0.to_ne_bytes() } /// Returns an iterator over all values from 0 up to and not /// including the given length. /// /// If the given length exceeds this type's limit, then this /// panics. pub(crate) fn iter(len: usize) -> $iter { $iter::new(len) } } // We write our own Debug impl so that we get things like PatternID(5) // instead of PatternID(SmallIndex(5)). impl core::fmt::Debug for $name { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish() } } impl<T> core::ops::Index<$name> for [T] { type Output = T; #[inline] fn index(&self, index: $name) -> &T { &self[index.as_usize()] } } impl<T> core::ops::IndexMut<$name> for [T] { #[inline] fn index_mut(&mut self, index: $name) -> &mut T { &mut self[index.as_usize()] } } impl<T> core::ops::Index<$name> for Vec<T> { type Output = T; #[inline] fn index(&self, index: $name) -> &T { &self[index.as_usize()] } } impl<T> core::ops::IndexMut<$name> for Vec<T> { #[inline] fn index_mut(&mut self, index: $name) -> &mut T { &mut self[index.as_usize()] } } impl From<SmallIndex> for $name { fn from(index: SmallIndex) -> $name { $name(index) } } impl From<u8> for $name { fn from(value: u8) -> $name { $name(SmallIndex::from(value)) } } impl TryFrom<u16> for $name { type Error = $err; fn try_from(value: u16) -> Result<$name, $err> { SmallIndex::try_from(value).map($name).map_err($err) } } impl TryFrom<u32> for $name { type Error = $err; fn try_from(value: u32) -> Result<$name, $err> { SmallIndex::try_from(value).map($name).map_err($err) } } impl TryFrom<u64> for $name { type Error = $err; fn try_from(value: u64) -> Result<$name, $err> { SmallIndex::try_from(value).map($name).map_err($err) } } impl TryFrom<usize> for $name { type Error = $err; fn try_from(value: usize) -> Result<$name, $err> { SmallIndex::try_from(value).map($name).map_err($err) } } /// This error occurs when an ID could not be constructed. /// /// This occurs when given an integer exceeding the maximum allowed /// value. /// /// When the `std` feature is enabled, this implements the `Error` /// trait.
#[derive(Clone, Debug, Eq, PartialEq)] pub struct $err(SmallIndexError); impl $err { /// Returns the value that could not be converted to an ID. pub fn attempted(&self) -> u64 { self.0.attempted() } } #[cfg(feature = "std")] impl std::error::Error for $err {} impl core::fmt::Display for $err { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!( f, "failed to create {} from {:?}, which exceeds {:?}", stringify!($name), self.attempted(), $name::MAX, ) } } #[derive(Clone, Debug)] pub(crate) struct $iter(SmallIndexIter); impl $iter { fn new(len: usize) -> $iter { assert!( len <= $name::LIMIT, "cannot create iterator for {} when number of \ elements exceed {:?}", stringify!($name), $name::LIMIT, ); $iter(SmallIndexIter { rng: 0..len }) } } impl Iterator for $iter { type Item = $name; fn next(&mut self) -> Option<$name> { self.0.next().map($name) } } /// An iterator adapter that is like std::iter::Enumerate, but attaches /// small index values instead. It requires `ExactSizeIterator`. At /// construction, it ensures that the index of each element in the /// iterator is representable in the corresponding small index type. #[derive(Clone, Debug)] pub(crate) struct $withiter<I> { it: I, ids: $iter, } impl<I: Iterator + ExactSizeIterator> $withiter<I> { fn new(it: I) -> $withiter<I> { let ids = $name::iter(it.len()); $withiter { it, ids } } } impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> { type Item = ($name, I::Item); fn next(&mut self) -> Option<($name, I::Item)> { let item = self.it.next()?; // Number of elements in this iterator must match, according // to contract of ExactSizeIterator. let id = self.ids.next().unwrap(); Some((id, item)) } } }; } /// The identifier of a pattern in an Aho-Corasick automaton. /// /// It is represented by a `u32` even on 64-bit systems in order to conserve /// space. Namely, on all targets, this type guarantees that its value will /// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit /// targets, for example, this type's maximum value will never overflow an /// `isize`, which means it will never overflow an `i16` even though its /// internal representation is still a `u32`. /// /// # Safety /// /// While a `PatternID` is meant to guarantee that its value fits into `usize` /// without using as much space as a `usize` on all targets, callers must /// not rely on this property for safety. Callers may choose to rely on this /// property for correctness however. For example, creating a `PatternID` with an /// invalid value can be done in entirely safe code. This may in turn result in /// panics or silent logical errors. #[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] #[repr(transparent)] pub struct PatternID(SmallIndex); /// The identifier of a finite automaton state. /// /// It is represented by a `u32` even on 64-bit systems in order to conserve /// space. Namely, on all targets, this type guarantees that its value will /// fit in a `u32`, `i32`, `usize` and an `isize`. This means that on 16-bit /// targets, for example, this type's maximum value will never overflow an /// `isize`, which means it will never overflow an `i16` even though its /// internal representation is still a `u32`. /// /// # Safety /// /// While a `StateID` is meant to guarantee that its value fits into `usize` /// without using as much space as a `usize` on all targets, callers must /// not rely on this property for safety. Callers may choose to rely on this /// property for correctness however. For example, creating a `StateID` with an /// invalid value can be done in entirely safe code.
This may in turn result in /// panics or silent logical errors. #[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)] #[repr(transparent)] pub struct StateID(SmallIndex); index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter); index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter); /// A utility trait that defines a couple of adapters for making it convenient /// to access indices as "small index" types. We require ExactSizeIterator so /// that iterator construction can do a single check to make sure the index of /// each element is representable by its small index type. pub(crate) trait IteratorIndexExt: Iterator { fn with_pattern_ids(self) -> WithPatternIDIter<Self> where Self: Sized + ExactSizeIterator, { WithPatternIDIter::new(self) } fn with_state_ids(self) -> WithStateIDIter<Self> where Self: Sized + ExactSizeIterator, { WithStateIDIter::new(self) } } impl<I: Iterator> IteratorIndexExt for I {} aho-corasick-1.1.3/src/util/remapper.rs000064400000000000000000000224421046102023000161220ustar 00000000000000use alloc::vec::Vec; use crate::{nfa::noncontiguous, util::primitives::StateID}; /// Remappable is a tightly coupled abstraction that facilitates remapping /// state identifiers in DFAs. /// /// The main idea behind remapping state IDs is that DFAs often need to check /// if a certain state is a "special" state of some kind (like a match state) /// during a search. Since this is extremely perf critical code, we want this /// check to be as fast as possible. Partitioning state IDs, for example, into /// "non-match" and "match" states means one can tell if a state is a /// match state via a simple comparison of the state ID. /// /// The issue is that during the DFA construction process, it's not /// particularly easy to partition the states. Instead, the simplest thing is /// to often just do a pass over all of the states and shuffle them into their /// desired partitionings. To do that, we need a mechanism for swapping states. /// Hence, this abstraction. /// /// Normally, for such little code, I would just duplicate it. But this is a /// key optimization and the implementation is a bit subtle. So the abstraction /// is basically a ham-fisted attempt at DRY. The only place we use this is in /// the dense and one-pass DFAs. /// /// See also src/dfa/special.rs for a more detailed explanation of how dense /// DFAs are partitioned. pub(crate) trait Remappable: core::fmt::Debug { /// Return the total number of states. fn state_len(&self) -> usize; /// Swap the states pointed to by the given IDs. The underlying finite /// state machine should be mutated such that all of the transitions in /// `id1` are now in the memory region where the transitions for `id2` /// were, and all of the transitions in `id2` are now in the memory region /// where the transitions for `id1` were. /// /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`. /// /// It is expected that, after calling this, the underlying state machine /// will be left in an inconsistent state, since any other transitions /// pointing to, e.g., `id1` need to be updated to point to `id2`, since /// that's where `id1` moved to. /// /// In order to "fix" the underlying inconsistent state, a `Remapper` /// should be used to guarantee that `remap` is called at the appropriate /// time. fn swap_states(&mut self, id1: StateID, id2: StateID); /// This must remap every single state ID in the underlying value according /// to the function given.
For example, in a DFA, this should remap every /// transition and every starting state ID. fn remap(&mut self, map: impl Fn(StateID) -> StateID); } /// Remapper is an abstraction that manages the remapping of state IDs in a /// finite state machine. This is useful when one wants to shuffle states into /// different positions in the machine. /// /// One of the key complexities this manages is the ability to correctly move /// one state multiple times. /// /// Once shuffling is complete, `remap` must be called, which will rewrite /// all pertinent transitions to updated state IDs. Neglecting to call `remap` /// will almost certainly result in a corrupt machine. #[derive(Debug)] pub(crate) struct Remapper { /// A map from the index of a state to its pre-multiplied identifier. /// /// When a state is swapped with another, then their corresponding /// locations in this map are also swapped. Thus, its new position will /// still point to its old pre-multiplied StateID. /// /// While there is a bit more to it, this then allows us to rewrite the /// state IDs in a DFA's transition table in a single pass. This is done /// by iterating over every ID in this map, then iterating over each /// transition for the state at that ID and re-mapping the transition from /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position /// in this map where `old_id` *started*, and set it to where it ended up /// after all swaps have been completed. map: Vec<StateID>, /// A way to map indices to state IDs (and back). idx: IndexMapper, } impl Remapper { /// Create a new remapper from the given remappable implementation. The /// remapper can then be used to swap states. The remappable value given /// here must be the same one given to `swap` and `remap`. /// /// The given stride should be the stride of the transition table expressed /// as a power of 2. This stride is used to map between state IDs and state /// indices. If state IDs and state indices are equivalent, then provide /// a `stride2` of `0`, which acts as an identity. pub(crate) fn new(r: &impl Remappable, stride2: usize) -> Remapper { let idx = IndexMapper { stride2 }; let map = (0..r.state_len()).map(|i| idx.to_state_id(i)).collect(); Remapper { map, idx } } /// Swap two states. Once this is called, callers must follow through to /// call `remap`, or else it's possible for the underlying remappable /// value to be in a corrupt state. pub(crate) fn swap( &mut self, r: &mut impl Remappable, id1: StateID, id2: StateID, ) { if id1 == id2 { return; } r.swap_states(id1, id2); self.map.swap(self.idx.to_index(id1), self.idx.to_index(id2)); } /// Complete the remapping process by rewriting all state IDs in the /// remappable value according to the swaps performed. pub(crate) fn remap(mut self, r: &mut impl Remappable) { // Update the map to account for states that have been swapped // multiple times. For example, if (A, C) and (C, G) are swapped, then // transitions previously pointing to A should now point to G. But if // we don't update our map, they will erroneously be set to C. All we // do is follow the swaps in our map until we see our original state // ID. // // The intuition here is to think about how changes are made to the // map: only through pairwise swaps. That means that starting at any // given state, it is always possible to find the loop back to that // state by following the swaps represented in the map (which might be // 0 swaps). // // We are also careful to clone the map before starting in order to // freeze it.
We use the frozen map to find our loops, since we need to // update our map as well. Without freezing it, our updates could break // the loops referenced above and produce incorrect results. let oldmap = self.map.clone(); for i in 0..r.state_len() { let cur_id = self.idx.to_state_id(i); let mut new_id = oldmap[i]; if cur_id == new_id { continue; } loop { let id = oldmap[self.idx.to_index(new_id)]; if cur_id == id { self.map[i] = new_id; break; } new_id = id; } } r.remap(|sid| self.map[self.idx.to_index(sid)]); } } /// A simple type for mapping between state indices and state IDs. /// /// The reason why this exists is because state IDs are "premultiplied" in a /// DFA. That is, in order to get to the transitions for a particular state, /// one need only use the state ID as-is, instead of having to multiply it by /// the transition table's stride. /// /// The downside of this is that it's inconvenient to map between state IDs /// using a dense map, e.g., Vec<T>. That's because state IDs look like /// `0`, `stride`, `2*stride`, `3*stride`, etc., instead of `0`, `1`, `2`, `3`, /// etc. /// /// Since our state IDs are premultiplied, we can convert back-and-forth /// between IDs and indices by simply unmultiplying the IDs and multiplying the /// indices. /// /// Note that for a sparse NFA, state IDs and indices are equivalent. In this /// case, we set the stride of the index mapper to be `0`, which acts as an /// identity. #[derive(Debug)] struct IndexMapper { /// The power of 2 corresponding to the stride of the corresponding /// transition table. 'id >> stride2' de-multiplies an ID while 'index << /// stride2' pre-multiplies an index to an ID. stride2: usize, } impl IndexMapper { /// Convert a state ID to a state index. fn to_index(&self, id: StateID) -> usize { id.as_usize() >> self.stride2 } /// Convert a state index to a state ID. fn to_state_id(&self, index: usize) -> StateID { // CORRECTNESS: If the given index is not valid, then it is not // required for this to panic or return a valid state ID. We'll "just" // wind up with panics or silent logic errors at some other point. But // this is OK because if Remappable::state_len is correct and so is // 'to_index', then all inputs to 'to_state_id' should be valid indices // and thus transform into valid state IDs. StateID::new_unchecked(index << self.stride2) } } impl Remappable for noncontiguous::NFA { fn state_len(&self) -> usize { noncontiguous::NFA::states(self).len() } fn swap_states(&mut self, id1: StateID, id2: StateID) { noncontiguous::NFA::swap_states(self, id1, id2) } fn remap(&mut self, map: impl Fn(StateID) -> StateID) { noncontiguous::NFA::remap(self, map) } } aho-corasick-1.1.3/src/util/search.rs000064400000000000000000001174011046102023000155540ustar 00000000000000use core::ops::{Range, RangeBounds}; use crate::util::primitives::PatternID; /// The configuration and the haystack to use for an Aho-Corasick search. /// /// When executing a search, there are a few parameters one might want to /// configure: /// /// * The haystack to search, provided to the [`Input::new`] constructor. This /// is the only required parameter. /// * The span _within_ the haystack to limit a search to. (The default /// is the entire haystack.) This is configured via [`Input::span`] or /// [`Input::range`]. /// * Whether to run an unanchored (matches can occur anywhere after the /// start of the search) or anchored (matches can only occur beginning at /// the start of the search) search. Unanchored search is the default.
This is /// configured via [`Input::anchored`]. /// * Whether to quit the search as soon as a match has been found, regardless /// of the [`MatchKind`] that the searcher was built with. This is configured /// via [`Input::earliest`]. /// /// For most cases, the defaults for all optional parameters are appropriate. /// The utility of this type is that it keeps the default or common case simple /// while permitting tweaking parameters in more niche use cases while reusing /// the same search APIs. /// /// # Valid bounds and search termination /// /// An `Input` permits setting the bounds of a search via either /// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or /// else a panic will occur. Bounds are valid if and only if: /// /// * The bounds represent a valid range into the input's haystack. /// * **or** the end bound is a valid ending bound for the haystack *and* /// the start bound is exactly one greater than the end bound. /// /// In the latter case, [`Input::is_done`] will return true and indicates any /// search receiving such an input should immediately return with no match. /// /// Other than representing "search is complete," the `Input::span` and /// `Input::range` APIs are never necessary. Instead, callers can slice the /// haystack instead, e.g., with `&haystack[start..end]`. With that said, they /// can be more convenient than slicing because the match positions reported /// when using `Input::span` or `Input::range` are in terms of the original /// haystack. If you instead use `&haystack[start..end]`, then you'll need to /// add `start` to any match position returned in order for it to be a correct /// index into `haystack`. /// /// # Example: `&str` and `&[u8]` automatically convert to an `Input` /// /// There is a `From<&T> for Input` implementation for all `T: AsRef<[u8]>`. /// Additionally, the [`AhoCorasick`](crate::AhoCorasick) search APIs accept /// a `Into`. These two things combined together mean you can provide /// things like `&str` and `&[u8]` to search APIs when the defaults are /// suitable, but also an `Input` when they're not. For example: /// /// ``` /// use aho_corasick::{AhoCorasick, Anchored, Input, Match, StartKind}; /// /// // Build a searcher that supports both unanchored and anchored modes. /// let ac = AhoCorasick::builder() /// .start_kind(StartKind::Both) /// .build(&["abcd", "b"]) /// .unwrap(); /// let haystack = "abcd"; /// /// // A search using default parameters is unanchored. With standard /// // semantics, this finds `b` first. /// assert_eq!( /// Some(Match::must(1, 1..2)), /// ac.find(haystack), /// ); /// // Using the same 'find' routine, we can provide an 'Input' explicitly /// // that is configured to do an anchored search. Since 'b' doesn't start /// // at the beginning of the search, it is not reported as a match. /// assert_eq!( /// Some(Match::must(0, 0..4)), /// ac.find(Input::new(haystack).anchored(Anchored::Yes)), /// ); /// ``` #[derive(Clone)] pub struct Input<'h> { haystack: &'h [u8], span: Span, anchored: Anchored, earliest: bool, } impl<'h> Input<'h> { /// Create a new search configuration for the given haystack. #[inline] pub fn new>(haystack: &'h H) -> Input<'h> { Input { haystack: haystack.as_ref(), span: Span { start: 0, end: haystack.as_ref().len() }, anchored: Anchored::No, earliest: false, } } /// Set the span for this search. /// /// This routine is generic over how a span is provided. While /// a [`Span`] may be given directly, one may also provide a /// `std::ops::Range`. 
To provide anything supported by range /// syntax, use the [`Input::range`] method. /// /// The default span is the entire haystack. /// /// Note that [`Input::range`] overrides this method and vice versa. /// /// # Panics /// /// This panics if the given span does not correspond to valid bounds in /// the haystack or the termination of a search. /// /// # Example /// /// This example shows how the span of the search can impact whether a /// match is reported or not. /// /// ``` /// use aho_corasick::{AhoCorasick, Input, MatchKind}; /// /// let patterns = &["b", "abcd", "abc"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// let input = Input::new(haystack).span(0..3); /// let mat = ac.try_find(input)?.expect("should have a match"); /// // Without the span stopping the search early, 'abcd' would be reported /// // because it is the correct leftmost-first match. /// assert_eq!("abc", &haystack[mat.span()]); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn span>(mut self, span: S) -> Input<'h> { self.set_span(span); self } /// Like `Input::span`, but accepts any range instead. /// /// The default range is the entire haystack. /// /// Note that [`Input::span`] overrides this method and vice versa. /// /// # Panics /// /// This routine will panic if the given range could not be converted /// to a valid [`Range`]. For example, this would panic when given /// `0..=usize::MAX` since it cannot be represented using a half-open /// interval in terms of `usize`. /// /// This routine also panics if the given range does not correspond to /// valid bounds in the haystack or the termination of a search. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// /// let input = Input::new("foobar").range(2..=4); /// assert_eq!(2..5, input.get_range()); /// ``` #[inline] pub fn range>(mut self, range: R) -> Input<'h> { self.set_range(range); self } /// Sets the anchor mode of a search. /// /// When a search is anchored (via [`Anchored::Yes`]), a match must begin /// at the start of a search. When a search is not anchored (that's /// [`Anchored::No`]), searchers will look for a match anywhere in the /// haystack. /// /// By default, the anchored mode is [`Anchored::No`]. /// /// # Support for anchored searches /// /// Anchored or unanchored searches might not always be available, /// depending on the type of searcher used and its configuration: /// /// * [`noncontiguous::NFA`](crate::nfa::noncontiguous::NFA) always /// supports both unanchored and anchored searches. /// * [`contiguous::NFA`](crate::nfa::contiguous::NFA) always supports both /// unanchored and anchored searches. /// * [`dfa::DFA`](crate::dfa::DFA) supports only unanchored /// searches by default. /// [`dfa::Builder::start_kind`](crate::dfa::Builder::start_kind) can /// be used to change the default to supporting both kinds of searches /// or even just anchored searches. /// * [`AhoCorasick`](crate::AhoCorasick) inherits the same setup as a /// `DFA`. Namely, it only supports unanchored searches by default, but /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) /// can change this. /// /// If you try to execute a search using a `try_` ("fallible") method with /// an unsupported anchor mode, then an error will be returned. For calls /// to infallible search methods, a panic will result. 
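/// /// For instance (a small sketch): since a default `AhoCorasick` searcher /// only supports unanchored searches, requesting an anchored search through /// a fallible API returns an error. /// /// ``` /// use aho_corasick::{AhoCorasick, Anchored, Input}; /// /// let ac = AhoCorasick::new(["abc"]).unwrap(); /// let input = Input::new("abc").anchored(Anchored::Yes); /// assert!(ac.try_find(input).is_err()); /// ```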
/// /// # Example /// /// This demonstrates the differences between an anchored search and /// an unanchored search. Notice that we build our `AhoCorasick` searcher /// with [`StartKind::Both`] so that it supports both unanchored and /// anchored searches simultaneously. /// /// ``` /// use aho_corasick::{ /// AhoCorasick, Anchored, Input, MatchKind, StartKind, /// }; /// /// let patterns = &["bcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasick::builder() /// .start_kind(StartKind::Both) /// .build(patterns) /// .unwrap(); /// /// // Note that 'Anchored::No' is the default, so it doesn't need to /// // be explicitly specified here. /// let input = Input::new(haystack); /// let mat = ac.try_find(input)?.expect("should have a match"); /// assert_eq!("bcd", &haystack[mat.span()]); /// /// // While 'bcd' occurs in the haystack, it does not begin where our /// // search begins, so no match is found. /// let input = Input::new(haystack).anchored(Anchored::Yes); /// assert_eq!(None, ac.try_find(input)?); /// /// // However, if we start our search where 'bcd' starts, then we will /// // find a match. /// let input = Input::new(haystack).range(1..).anchored(Anchored::Yes); /// let mat = ac.try_find(input)?.expect("should have a match"); /// assert_eq!("bcd", &haystack[mat.span()]); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn anchored(mut self, mode: Anchored) -> Input<'h> { self.set_anchored(mode); self } /// Whether to execute an "earliest" search or not. /// /// When running a non-overlapping search, an "earliest" search will /// return the match location as early as possible. For example, given /// the patterns `abc` and `b`, and a haystack of `abc`, a normal /// leftmost-first search will return `abc` as a match. But an "earliest" /// search will return as soon as it is known that a match occurs, which /// happens once `b` is seen. /// /// Note that when using [`MatchKind::Standard`], the "earliest" option /// has no effect since standard semantics are already "earliest." Note /// also that this has no effect in overlapping searches, since overlapping /// searches also use standard semantics and report all possible matches. /// /// This is disabled by default. /// /// # Example /// /// This example shows the difference between "earliest" searching and /// normal leftmost searching. /// /// ``` /// use aho_corasick::{AhoCorasick, Anchored, Input, MatchKind, StartKind}; /// /// let patterns = &["abc", "b"]; /// let haystack = "abc"; /// /// let ac = AhoCorasick::builder() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns) /// .unwrap(); /// /// // The normal leftmost-first match. /// let input = Input::new(haystack); /// let mat = ac.try_find(input)?.expect("should have a match"); /// assert_eq!("abc", &haystack[mat.span()]); /// /// // The "earliest" possible match, even if it isn't leftmost-first. /// let input = Input::new(haystack).earliest(true); /// let mat = ac.try_find(input)?.expect("should have a match"); /// assert_eq!("b", &haystack[mat.span()]); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn earliest(mut self, yes: bool) -> Input<'h> { self.set_earliest(yes); self } /// Set the span for this search configuration. /// /// This is like the [`Input::span`] method, except this mutates the /// span in place. /// /// This routine is generic over how a span is provided. While /// a [`Span`] may be given directly, one may also provide a /// `std::ops::Range`. 
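/// /// For example (a small sketch), a [`Span`] and the equivalent range may be /// used interchangeably: /// /// ``` /// use aho_corasick::{Input, Span}; /// /// let mut a = Input::new("foobar"); /// a.set_span(Span { start: 2, end: 4 }); /// let mut b = Input::new("foobar"); /// b.set_span(2..4); /// assert_eq!(a.get_range(), b.get_range()); /// ```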
/// /// # Panics /// /// This panics if the given span does not correspond to valid bounds in /// the haystack or the termination of a search. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let mut input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// input.set_span(2..4); /// assert_eq!(2..4, input.get_range()); /// ``` #[inline] pub fn set_span<S: Into<Span>>(&mut self, span: S) { let span = span.into(); assert!( span.end <= self.haystack.len() && span.start <= span.end.wrapping_add(1), "invalid span {:?} for haystack of length {}", span, self.haystack.len(), ); self.span = span; } /// Set the span for this search configuration given any range. /// /// This is like the [`Input::range`] method, except this mutates the /// span in place. /// /// # Panics /// /// This routine will panic if the given range could not be converted /// to a valid [`Range`]. For example, this would panic when given /// `0..=usize::MAX` since it cannot be represented using a half-open /// interval in terms of `usize`. /// /// This routine also panics if the given range does not correspond to /// valid bounds in the haystack or the termination of a search. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let mut input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// input.set_range(2..=4); /// assert_eq!(2..5, input.get_range()); /// ``` #[inline] pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) { use core::ops::Bound; // It's a little weird to convert ranges into spans, and then spans // back into ranges when we actually slice the haystack. Because // of that process, we always represent everything as a half-open // interval. Therefore, handling things like m..=n is a little awkward. let start = match range.start_bound() { Bound::Included(&i) => i, // Can this case ever happen? Range syntax doesn't support it... Bound::Excluded(&i) => i.checked_add(1).unwrap(), Bound::Unbounded => 0, }; let end = match range.end_bound() { Bound::Included(&i) => i.checked_add(1).unwrap(), Bound::Excluded(&i) => i, Bound::Unbounded => self.haystack().len(), }; self.set_span(Span { start, end }); } /// Set the starting offset for the span for this search configuration. /// /// This is a convenience routine for only mutating the start of a span /// without having to set the entire span. /// /// # Panics /// /// This panics if the given span does not correspond to valid bounds in /// the haystack or the termination of a search. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let mut input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// input.set_start(5); /// assert_eq!(5..6, input.get_range()); /// ``` #[inline] pub fn set_start(&mut self, start: usize) { self.set_span(Span { start, ..self.get_span() }); } /// Set the ending offset for the span for this search configuration. /// /// This is a convenience routine for only mutating the end of a span /// without having to set the entire span. /// /// # Panics /// /// This panics if the given span does not correspond to valid bounds in /// the haystack or the termination of a search. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let mut input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// input.set_end(5); /// assert_eq!(0..5, input.get_range()); /// ``` #[inline] pub fn set_end(&mut self, end: usize) { self.set_span(Span { end, ..self.get_span() }); } /// Set the anchor mode of a search.
/// /// This is like [`Input::anchored`], except it mutates the search /// configuration in place. /// /// # Example /// /// ``` /// use aho_corasick::{Anchored, Input}; /// /// let mut input = Input::new("foobar"); /// assert_eq!(Anchored::No, input.get_anchored()); /// /// input.set_anchored(Anchored::Yes); /// assert_eq!(Anchored::Yes, input.get_anchored()); /// ``` #[inline] pub fn set_anchored(&mut self, mode: Anchored) { self.anchored = mode; } /// Set whether the search should execute in "earliest" mode or not. /// /// This is like [`Input::earliest`], except it mutates the search /// configuration in place. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let mut input = Input::new("foobar"); /// assert!(!input.get_earliest()); /// input.set_earliest(true); /// assert!(input.get_earliest()); /// ``` #[inline] pub fn set_earliest(&mut self, yes: bool) { self.earliest = yes; } /// Return a borrow of the underlying haystack as a slice of bytes. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let input = Input::new("foobar"); /// assert_eq!(b"foobar", input.haystack()); /// ``` #[inline] pub fn haystack(&self) -> &[u8] { self.haystack } /// Return the start position of this search. /// /// This is a convenience routine for `search.get_span().start()`. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let input = Input::new("foobar"); /// assert_eq!(0, input.start()); /// /// let input = Input::new("foobar").span(2..4); /// assert_eq!(2, input.start()); /// ``` #[inline] pub fn start(&self) -> usize { self.get_span().start } /// Return the end position of this search. /// /// This is a convenience routine for `search.get_span().end()`. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let input = Input::new("foobar"); /// assert_eq!(6, input.end()); /// /// let input = Input::new("foobar").span(2..4); /// assert_eq!(4, input.end()); /// ``` #[inline] pub fn end(&self) -> usize { self.get_span().end } /// Return the span for this search configuration. /// /// If one was not explicitly set, then the span corresponds to the entire /// range of the haystack. /// /// # Example /// /// ``` /// use aho_corasick::{Input, Span}; /// /// let input = Input::new("foobar"); /// assert_eq!(Span { start: 0, end: 6 }, input.get_span()); /// ``` #[inline] pub fn get_span(&self) -> Span { self.span } /// Return the span as a range for this search configuration. /// /// If one was not explicitly set, then the span corresponds to the entire /// range of the haystack. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// ``` #[inline] pub fn get_range(&self) -> Range { self.get_span().range() } /// Return the anchored mode for this search configuration. /// /// If no anchored mode was set, then it defaults to [`Anchored::No`]. /// /// # Example /// /// ``` /// use aho_corasick::{Anchored, Input}; /// /// let mut input = Input::new("foobar"); /// assert_eq!(Anchored::No, input.get_anchored()); /// /// input.set_anchored(Anchored::Yes); /// assert_eq!(Anchored::Yes, input.get_anchored()); /// ``` #[inline] pub fn get_anchored(&self) -> Anchored { self.anchored } /// Return whether this search should execute in "earliest" mode. 
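/// /// This reports the flag set via [`Input::earliest`] or /// [`Input::set_earliest`]. For instance (a small sketch): /// /// ``` /// use aho_corasick::Input; /// /// let input = Input::new("foobar").earliest(true); /// assert!(input.get_earliest()); /// ```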
/// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let input = Input::new("foobar"); /// assert!(!input.get_earliest()); /// ``` #[inline] pub fn get_earliest(&self) -> bool { self.earliest } /// Return true if this input has been exhausted, which in turn means all /// subsequent searches will return no matches. /// /// This occurs precisely when the start position of this search is greater /// than the end position of the search. /// /// # Example /// /// ``` /// use aho_corasick::Input; /// /// let mut input = Input::new("foobar"); /// assert!(!input.is_done()); /// input.set_start(6); /// assert!(!input.is_done()); /// input.set_start(7); /// assert!(input.is_done()); /// ``` #[inline] pub fn is_done(&self) -> bool { self.get_span().start > self.get_span().end } } impl<'h> core::fmt::Debug for Input<'h> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let mut fmter = f.debug_struct("Input"); match core::str::from_utf8(self.haystack()) { Ok(nice) => fmter.field("haystack", &nice), Err(_) => fmter.field("haystack", &self.haystack()), } .field("span", &self.span) .field("anchored", &self.anchored) .field("earliest", &self.earliest) .finish() } } impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { #[inline] fn from(haystack: &'h H) -> Input<'h> { Input::new(haystack) } } /// A representation of a range in a haystack. /// /// A span corresponds to the starting and ending _byte offsets_ of a /// contiguous region of bytes. The starting offset is inclusive while the /// ending offset is exclusive. That is, a span is a half-open interval. /// /// A span is used to report the offsets of a match, but it is also used to /// convey which region of a haystack should be searched via routines like /// [`Input::span`]. /// /// This is basically equivalent to a `std::ops::Range`, except this /// type implements `Copy` which makes it more ergonomic to use in the context /// of this crate. Indeed, `Span` exists only because `Range` does /// not implement `Copy`. Like a range, this implements `Index` for `[u8]` /// and `str`, and `IndexMut` for `[u8]`. For convenience, this also impls /// `From`, which means things like `Span::from(5..10)` work. /// /// There are no constraints on the values of a span. It is, for example, legal /// to create a span where `start > end`. #[derive(Clone, Copy, Eq, Hash, PartialEq)] pub struct Span { /// The start offset of the span, inclusive. pub start: usize, /// The end offset of the span, exclusive. pub end: usize, } impl Span { /// Returns this span as a range. #[inline] pub fn range(&self) -> Range { Range::from(*self) } /// Returns true when this span is empty. That is, when `start >= end`. #[inline] pub fn is_empty(&self) -> bool { self.start >= self.end } /// Returns the length of this span. /// /// This returns `0` in precisely the cases that `is_empty` returns `true`. #[inline] pub fn len(&self) -> usize { self.end.saturating_sub(self.start) } /// Returns true when the given offset is contained within this span. /// /// Note that an empty span contains no offsets and will always return /// false. #[inline] pub fn contains(&self, offset: usize) -> bool { !self.is_empty() && self.start <= offset && offset <= self.end } /// Returns a new span with `offset` added to this span's `start` and `end` /// values. 
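/// /// For instance (a small sketch), a span found by searching a slice like /// `&haystack[10..]` can be translated back into offsets into `haystack` /// itself: /// /// ``` /// use aho_corasick::Span; /// /// let local = Span { start: 1, end: 3 }; /// assert_eq!(Span { start: 11, end: 13 }, local.offset(10)); /// ```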
#[inline] pub fn offset(&self, offset: usize) -> Span { Span { start: self.start + offset, end: self.end + offset } } } impl core::fmt::Debug for Span { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "{}..{}", self.start, self.end) } } impl core::ops::Index<Span> for [u8] { type Output = [u8]; #[inline] fn index(&self, index: Span) -> &[u8] { &self[index.range()] } } impl core::ops::IndexMut<Span> for [u8] { #[inline] fn index_mut(&mut self, index: Span) -> &mut [u8] { &mut self[index.range()] } } impl core::ops::Index<Span> for str { type Output = str; #[inline] fn index(&self, index: Span) -> &str { &self[index.range()] } } impl From<Range<usize>> for Span { #[inline] fn from(range: Range<usize>) -> Span { Span { start: range.start, end: range.end } } } impl From<Span> for Range<usize> { #[inline] fn from(span: Span) -> Range<usize> { Range { start: span.start, end: span.end } } } impl PartialEq<Range<usize>> for Span { #[inline] fn eq(&self, range: &Range<usize>) -> bool { self.start == range.start && self.end == range.end } } impl PartialEq<Span> for Range<usize> { #[inline] fn eq(&self, span: &Span) -> bool { self.start == span.start && self.end == span.end } } /// The type of anchored search to perform. /// /// If an Aho-Corasick searcher does not support the anchored mode selected, /// then the search will return an error or panic, depending on whether a /// fallible or an infallible routine was called. #[non_exhaustive] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum Anchored { /// Run an unanchored search. This means a match may occur anywhere at or /// after the start position of the search up until the end position of the /// search. No, /// Run an anchored search. This means that a match must begin at the start /// position of the search and end before the end position of the search. Yes, } impl Anchored { /// Returns true if and only if this anchor mode corresponds to an anchored /// search. /// /// # Example /// /// ``` /// use aho_corasick::Anchored; /// /// assert!(!Anchored::No.is_anchored()); /// assert!(Anchored::Yes.is_anchored()); /// ``` #[inline] pub fn is_anchored(&self) -> bool { matches!(*self, Anchored::Yes) } } /// A representation of a match reported by an Aho-Corasick searcher. /// /// A match has two essential pieces of information: the [`PatternID`] that /// matches, and the [`Span`] of the match in a haystack. /// /// The pattern is identified by an ID, which corresponds to its position /// (starting from `0`) relative to other patterns used to construct the /// corresponding searcher. If only a single pattern is provided, then all /// matches are guaranteed to have a pattern ID of `0`. /// /// Every match reported by a searcher guarantees that its span has its start /// offset as less than or equal to its end offset. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub struct Match { /// The pattern ID. pattern: PatternID, /// The underlying match span. span: Span, } impl Match { /// Create a new match from a pattern ID and a span. /// /// This constructor is generic over how a span is provided. While /// a [`Span`] may be given directly, one may also provide a /// `std::ops::Range<usize>`. /// /// # Panics /// /// This panics if `end < start`. /// /// # Example /// /// This shows how to create a match for the first pattern in an /// Aho-Corasick searcher using convenient range syntax.
/// /// ``` /// use aho_corasick::{Match, PatternID}; /// /// let m = Match::new(PatternID::ZERO, 5..10); /// assert_eq!(0, m.pattern().as_usize()); /// assert_eq!(5, m.start()); /// assert_eq!(10, m.end()); /// ``` #[inline] pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match { let span = span.into(); assert!(span.start <= span.end, "invalid match span"); Match { pattern, span } } /// Create a new match from a pattern ID and a byte offset span. /// /// This constructor is generic over how a span is provided. While /// a [`Span`] may be given directly, one may also provide a /// `std::ops::Range<usize>`. /// /// This is like [`Match::new`], but accepts a `usize` instead of a /// [`PatternID`]. This panics if the given `usize` is not representable /// as a `PatternID`. /// /// # Panics /// /// This panics if `end < start` or if `pattern > PatternID::MAX`. /// /// # Example /// /// This shows how to create a match for the third pattern in an /// Aho-Corasick searcher using convenient range syntax. /// /// ``` /// use aho_corasick::Match; /// /// let m = Match::must(3, 5..10); /// assert_eq!(3, m.pattern().as_usize()); /// assert_eq!(5, m.start()); /// assert_eq!(10, m.end()); /// ``` #[inline] pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match { Match::new(PatternID::must(pattern), span) } /// Returns the ID of the pattern that matched. /// /// The ID of a pattern is derived from the position in which it was /// originally inserted into the corresponding searcher. The first pattern /// has identifier `0`, and each subsequent pattern is `1`, `2` and so on. #[inline] pub fn pattern(&self) -> PatternID { self.pattern } /// The starting position of the match. /// /// This is a convenience routine for `Match::span().start`. #[inline] pub fn start(&self) -> usize { self.span().start } /// The ending position of the match. /// /// This is a convenience routine for `Match::span().end`. #[inline] pub fn end(&self) -> usize { self.span().end } /// Returns the match span as a range. /// /// This is a convenience routine for `Match::span().range()`. #[inline] pub fn range(&self) -> core::ops::Range<usize> { self.span().range() } /// Returns the span for this match. #[inline] pub fn span(&self) -> Span { self.span } /// Returns true when the span in this match is empty. /// /// An empty match can only be returned when an empty pattern is in the /// Aho-Corasick searcher. #[inline] pub fn is_empty(&self) -> bool { self.span().is_empty() } /// Returns the length of this match. /// /// This returns `0` in precisely the cases that `is_empty` returns `true`. #[inline] pub fn len(&self) -> usize { self.span().len() } /// Returns a new match with `offset` added to its span's `start` and `end` /// values. #[inline] pub fn offset(&self, offset: usize) -> Match { Match { pattern: self.pattern, span: Span { start: self.start() + offset, end: self.end() + offset, }, } } } /// A knob for controlling the match semantics of an Aho-Corasick automaton. /// /// There are two generally different ways that Aho-Corasick automatons can /// report matches. The first way is the "standard" approach that results from /// implementing most textbook explanations of Aho-Corasick. The second way is /// to report only the leftmost non-overlapping matches. The leftmost approach /// is in turn split into two different ways of resolving ambiguous matches: /// leftmost-first and leftmost-longest. /// /// The `Standard` match kind is the default and is the only one that supports /// overlapping matches and stream searching.
(Trying to find overlapping or /// streaming matches using leftmost match semantics will result in an error in /// fallible APIs and a panic when using infallible APIs.) The `Standard` match /// kind will report matches as they are seen. When searching for overlapping /// matches, all possible matches are reported. When searching for /// non-overlapping matches, the first match seen is reported. For example, for /// non-overlapping matches, given the patterns `abcd` and `b` and the haystack /// `abcdef`, only a match for `b` is reported since it is detected first. The /// `abcd` match is never reported since it overlaps with the `b` match. /// /// In contrast, the leftmost match kind always prefers the leftmost match /// among all possible matches. Given the same example as above with `abcd` and /// `b` as patterns and `abcdef` as the haystack, the leftmost match is `abcd` /// since it begins before the `b` match, even though the `b` match is detected /// before the `abcd` match. In this case, the `b` match is not reported at all /// since it overlaps with the `abcd` match. /// /// The difference between leftmost-first and leftmost-longest is in how they /// resolve ambiguous matches when there are multiple leftmost matches to /// choose from. Leftmost-first always chooses the pattern that was provided /// earliest, whereas leftmost-longest always chooses the longest matching /// pattern. For example, given the patterns `a` and `ab` and the subject /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match /// is `ab`. Conversely, if the patterns were given in reverse order, i.e., /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches /// would be `ab`. Stated differently, the leftmost-first match depends on the /// order in which the patterns were given to the Aho-Corasick automaton. /// Because of that, when leftmost-first matching is used, if a pattern `A` /// that appears before a pattern `B` is a prefix of `B`, then it is impossible /// to ever observe a match of `B`. /// /// If you're not sure which match kind to pick, then stick with the standard /// kind, which is the default. In particular, if you need overlapping or /// streaming matches, then you _must_ use the standard kind. The leftmost /// kinds are useful in specific circumstances. For example, leftmost-first can /// be very useful as a way to implement match priority based on the order of /// patterns given, and leftmost-longest can be useful for dictionary searching /// such that only the longest matching words are reported. /// /// # Relationship with regular expression alternations /// /// Understanding match semantics can be a little tricky, and one easy way /// to conceptualize non-overlapping matches from an Aho-Corasick automaton /// is to think about them as a simple alternation of literals in a regular /// expression. For example, let's say we wanted to match the strings /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It /// turns out that regular expression engines have two different ways of /// matching this alternation. The first way, leftmost-longest, is commonly /// found in POSIX compatible implementations of regular expressions (such as /// `grep`). The second way, leftmost-first, is commonly found in backtracking /// implementations such as Perl.
(Some regex engines, such as RE2 and Rust's /// regex engine, do not use backtracking, but still implement leftmost-first /// semantics in an effort to match the behavior of dominant backtracking /// regex engines such as those found in Perl, Ruby, Python, Javascript and /// PHP.) /// /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex /// will match `Samwise` because it is the longest possible match, but a /// Perl-like regex will match `Sam` since it appears earlier in the /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine /// will never match `Samwise` since `Sam` will always have higher priority. /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is /// still the longest match, but it also appears earlier than `Sam`. /// /// The "standard" match semantics of Aho-Corasick generally don't correspond /// to the match semantics of any large group of regex implementations, so /// there's no direct analogy that can be made here. Standard match semantics /// are generally useful for overlapping matches, or if you just want to see /// matches as they are detected. /// /// The main conclusion to draw from this section is that the match semantics /// can be tweaked to precisely match either Perl-like regex alternations or /// POSIX regex alternations. #[non_exhaustive] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum MatchKind { /// Use standard match semantics, which support overlapping matches. When /// used with non-overlapping matches, matches are reported as they are /// seen. Standard, /// Use leftmost-first match semantics, which reports leftmost matches. /// When there are multiple possible leftmost matches, the match /// corresponding to the pattern that appeared earlier when constructing /// the automaton is reported. /// /// This does **not** support overlapping matches or stream searching. If /// this match kind is used, attempting to find overlapping matches or /// stream matches will fail. LeftmostFirst, /// Use leftmost-longest match semantics, which reports leftmost matches. /// When there are multiple possible leftmost matches, the longest match /// is chosen. /// /// This does **not** support overlapping matches or stream searching. If /// this match kind is used, attempting to find overlapping matches or /// stream matches will fail. LeftmostLongest, } /// The default match kind is `MatchKind::Standard`. impl Default for MatchKind { fn default() -> MatchKind { MatchKind::Standard } } impl MatchKind { #[inline] pub(crate) fn is_standard(&self) -> bool { matches!(*self, MatchKind::Standard) } #[inline] pub(crate) fn is_leftmost(&self) -> bool { matches!(*self, MatchKind::LeftmostFirst | MatchKind::LeftmostLongest) } #[inline] pub(crate) fn is_leftmost_first(&self) -> bool { matches!(*self, MatchKind::LeftmostFirst) } /// Convert this match kind into a packed match kind. If this match kind /// corresponds to standard semantics, then this returns None, since /// packed searching does not support standard semantics. #[inline] pub(crate) fn as_packed(&self) -> Option<crate::packed::MatchKind> { match *self { MatchKind::Standard => None, MatchKind::LeftmostFirst => { Some(crate::packed::MatchKind::LeftmostFirst) } MatchKind::LeftmostLongest => { Some(crate::packed::MatchKind::LeftmostLongest) } } } } /// The kind of anchored starting configurations to support in an Aho-Corasick /// searcher.
/// /// Depending on which searcher is used internally by /// [`AhoCorasick`](crate::AhoCorasick), supporting both unanchored /// and anchored searches can be quite costly. For this reason, /// [`AhoCorasickBuilder::start_kind`](crate::AhoCorasickBuilder::start_kind) /// can be used to configure whether your searcher supports unanchored, /// anchored or both kinds of searches. /// /// This searcher configuration knob works in concert with the search time /// configuration [`Input::anchored`]. Namely, if one requests an unsupported /// anchored mode, then the search will either panic or return an error, /// depending on whether you're using infallible or fallible APIs, respectively. /// /// `AhoCorasick` by default only supports unanchored searches. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum StartKind { /// Support both anchored and unanchored searches. Both, /// Support only unanchored searches. Requesting an anchored search will /// return an error in fallible APIs and panic in infallible APIs. Unanchored, /// Support only anchored searches. Requesting an unanchored search will /// return an error in fallible APIs and panic in infallible APIs. Anchored, } impl Default for StartKind { fn default() -> StartKind { StartKind::Unanchored } } aho-corasick-1.1.3/src/util/special.rs000064400000000000000000000035001046102023000157210ustar 00000000000000use crate::util::primitives::StateID; /// A collection of sentinel state IDs for Aho-Corasick automata. /// /// This specifically enables the technique by which we determine which states /// are dead, matches or start states. Namely, by arranging states in a /// particular order, we can determine the type of a state simply by looking at /// its ID. #[derive(Clone, Debug)] pub(crate) struct Special { /// The maximum ID of all the "special" states. This corresponds to /// start_anchored_id when a prefilter is active, and to max_match_id when /// a prefilter is not active. The idea here is that if there is no /// prefilter, then there is no point in treating start states as special. pub(crate) max_special_id: StateID, /// The maximum ID of all the match states. Any state ID bigger than this /// is guaranteed to be a non-match ID. /// /// It is possible and legal for max_match_id to be equal to /// start_anchored_id, which occurs precisely in the case where the empty /// string is a pattern that was added to the underlying automaton. pub(crate) max_match_id: StateID, /// The state ID of the start state used for unanchored searches. pub(crate) start_unanchored_id: StateID, /// The state ID of the start state used for anchored searches. This is /// always start_unanchored_id+1. pub(crate) start_anchored_id: StateID, } impl Special { /// Create a new set of "special" state IDs with all IDs initialized to /// zero. The general idea here is that they will be updated and set to /// correct values later. pub(crate) fn zero() -> Special { Special { max_special_id: StateID::ZERO, max_match_id: StateID::ZERO, start_unanchored_id: StateID::ZERO, start_anchored_id: StateID::ZERO, } } }
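// What follows is a small illustrative sketch of the ID-ordering technique
// described above. The `is_match` helper is hypothetical: it merely mirrors
// the kind of comparison the automaton implementations perform inline (the
// real implementations also account for the sentinel fail/dead states).
#[cfg(test)]
mod example {
    use super::Special;
    use crate::util::primitives::StateID;

    // Hypothetical classifier: with match states packed at the low end of
    // the state ID space, "is this a match state?" is a single comparison.
    fn is_match(special: &Special, sid: StateID) -> bool {
        sid <= special.max_match_id
    }

    #[test]
    fn classify_by_comparison() {
        let mut special = Special::zero();
        // Suppose states 0..=3 are match states and 4..=5 are the start
        // states, as the field ordering above prescribes.
        special.max_match_id = StateID::must(3);
        special.max_special_id = StateID::must(5);
        assert!(is_match(&special, StateID::must(2)));
        assert!(!is_match(&special, StateID::must(4)));
    }
}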