aho-corasick-0.7.8/.github/workflows/ci.yml010066400017500000144000000060311360641721700170320ustar0000000000000000name: ci on: pull_request: push: branches: - master schedule: - cron: '00 01 * * *' jobs: test: name: test env: # For some builds, we use cross to test on 32-bit and big-endian # systems. CARGO: cargo # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`. TARGET: runs-on: ${{ matrix.os }} strategy: matrix: build: - pinned - stable - stable-32 - stable-mips - beta - nightly - macos - win-msvc - win-gnu include: - build: pinned os: ubuntu-18.04 rust: 1.28.0 - build: stable os: ubuntu-18.04 rust: stable - build: stable-32 os: ubuntu-18.04 rust: stable target: i686-unknown-linux-gnu - build: stable-mips os: ubuntu-18.04 rust: stable target: mips64-unknown-linux-gnuabi64 - build: beta os: ubuntu-18.04 rust: beta - build: nightly os: ubuntu-18.04 rust: nightly - build: macos os: macos-latest rust: stable - build: win-msvc os: windows-2019 rust: stable - build: win-gnu os: windows-2019 rust: stable-x86_64-gnu steps: - name: Checkout repository uses: actions/checkout@v1 with: fetch-depth: 1 - name: Install Rust uses: actions-rs/toolchain@v1 with: toolchain: ${{ matrix.rust }} profile: minimal override: true - name: Use Cross if: matrix.target != '' run: | # FIXME: to work around bugs in latest cross release, install master. # See: https://github.com/rust-embedded/cross/issues/357 cargo install --git https://github.com/rust-embedded/cross echo "::set-env name=CARGO::cross" echo "::set-env name=TARGET::--target ${{ matrix.target }}" - name: Show command used for Cargo run: | echo "cargo command is: ${{ env.CARGO }}" echo "target flag is: ${{ env.TARGET }}" - name: Show CPU info for debugging if: matrix.os == 'ubuntu-18.04' run: lscpu - run: ${{ env.CARGO }} build --verbose - run: ${{ env.CARGO }} doc --verbose - run: ${{ env.CARGO }} test --verbose - if: matrix.build == 'nightly' run: ${{ env.CARGO }} build --manifest-path aho-corasick-debug/Cargo.toml - if: matrix.build == 'nightly' run: ${{ env.CARGO }} bench --verbose --manifest-path bench/Cargo.toml -- --test rustfmt: name: rustfmt runs-on: ubuntu-18.04 steps: - name: Checkout repository uses: actions/checkout@v1 with: fetch-depth: 1 - name: Install Rust uses: actions-rs/toolchain@v1 with: toolchain: stable profile: minimal components: rustfmt - name: Install rustfmt run: rustup component add rustfmt - name: Check formatting run: | cargo fmt --all -- --check aho-corasick-0.7.8/.gitignore010066400017500001731000000002111361627453200143060ustar0000000000000000.*.swp doc tags examples/ss10pusa.csv build target /Cargo.lock scratch* bench_large/huge BREADCRUMBS /tmp /aho-corasick-debug/Cargo.lock aho-corasick-0.7.8/COPYING010064400017500000144000000001761274016735300133550ustar0000000000000000This project is dual-licensed under the Unlicense and MIT licenses. You may use this code under the terms of either license. aho-corasick-0.7.8/Cargo.toml.orig010066400017500001731000000021601361627454500152160ustar0000000000000000[package] name = "aho-corasick" version = "0.7.8" #:version authors = ["Andrew Gallant "] description = "Fast multiple substring searching." 
homepage = "https://github.com/BurntSushi/aho-corasick" repository = "https://github.com/BurntSushi/aho-corasick" readme = "README.md" keywords = ["string", "search", "text", "aho", "multi"] license = "Unlicense/MIT" categories = ["text-processing"] autotests = false exclude = [ "/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml", ] [workspace] members = ["bench"] # We'd ideally not do this, but since the debug tool uses Rust 2018, older # versions of Rust (such as 1.28) fail to parse the manifest because it treats # `edition = "2018"` as an unstable feature. # # When we move our MSRV to Rust 2018, then we should be able to add this back # to the workspace. exclude = ["aho-corasick-debug"] [lib] name = "aho_corasick" [features] default = ["std"] std = ["memchr/use_std"] [dependencies] memchr = { version = "2.2.0", default-features = false } [dev-dependencies] doc-comment = "0.3.1" [profile.release] debug = true [profile.bench] debug = true aho-corasick-0.7.8/Cargo.toml0000644000000023761361627455300115330ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] name = "aho-corasick" version = "0.7.8" authors = ["Andrew Gallant "] exclude = ["/aho-corasick-debug", "/ci/*", "/.travis.yml", "/appveyor.yml"] autotests = false description = "Fast multiple substring searching." homepage = "https://github.com/BurntSushi/aho-corasick" readme = "README.md" keywords = ["string", "search", "text", "aho", "multi"] categories = ["text-processing"] license = "Unlicense/MIT" repository = "https://github.com/BurntSushi/aho-corasick" [profile.bench] debug = true [profile.release] debug = true [lib] name = "aho_corasick" [dependencies.memchr] version = "2.2.0" default-features = false [dev-dependencies.doc-comment] version = "0.3.1" [features] default = ["std"] std = ["memchr/use_std"] aho-corasick-0.7.8/DESIGN.md010066400017500001731000000602071361627453200136240ustar0000000000000000This document describes the internal design of this crate, which is an object lesson in what happens when you take a fairly simple old algorithm like Aho-Corasick and make it fast and production ready. The target audience of this document is Rust programmers that have some familiarity with string searching, however, one does not need to know the Aho-Corasick algorithm in order to read this (it is explained below). One should, however, know what a trie is. (If you don't, go read its Wikipedia article.) The center-piece of this crate is an implementation of Aho-Corasick. On its own, Aho-Corasick isn't that complicated. The complex pieces come from the different variants of Aho-Corasick implemented in this crate. Specifically, they are: * Aho-Corasick as an NFA, using dense transitions near the root with sparse transitions elsewhere. * Aho-Corasick as a DFA. (An NFA is slower to search, but cheaper to construct and uses less memory.) * A DFA with pre-multiplied state identifiers. This saves a multiplication instruction in the core search loop. 
* A DFA with equivalence classes of bytes as the alphabet, instead of the traditional 256-byte alphabet. This shrinks the size of the DFA in memory, but adds an extra lookup in the core search loop to map the input byte to an equivalent class. * The option to choose how state identifiers are represented, via one of u8, u16, u32, u64 or usize. This permits creating compact automatons when matching a small number of patterns. * Supporting "standard" match semantics, along with its overlapping variant, in addition to leftmost-first and leftmost-longest semantics. The "standard" semantics are typically what you see in a textbook description of Aho-Corasick. However, Aho-Corasick is also useful as an optimization in regex engines, which often use leftmost-first or leftmost-longest semantics. Thus, it is useful to implement those semantics here. The "standard" and "leftmost" search algorithms are subtly different, and also require slightly different construction algorithms. * Support for ASCII case insensitive matching. * Support for accelerating searches when the patterns all start with a small number of fixed bytes. Or alternatively, when the patterns all contain a small number of rare bytes. (Searching for these bytes uses SIMD vectorized code courtesy of `memchr`.) * Transparent support for alternative SIMD vectorized search routines for smaller number of literals, such as the Teddy algorithm. We called these "packed" search routines because they use SIMD. They can often be an order of magnitude faster than just Aho-Corasick, but don't scale as well. * Support for searching streams. This can reuse most of the underlying code, but does require careful buffering support. * Support for anchored searches, which permit efficient `is_prefix` checks for a large number of patterns. When you combine all of this together along with trying to make everything as fast as possible, what you end up with is enitrely too much code with too much `unsafe`. Alas, I was not smart enough to figure out how to reduce it. Instead, we will explain it. # Basics The fundamental problem this crate is trying to solve is to determine the occurrences of possibly many patterns in a haystack. The naive way to solve this is to look for a match for each pattern at each position in the haystack: for i in 0..haystack.len(): for p in patterns.iter(): if haystack[i..].starts_with(p.bytes()): return Match(p.id(), i, i + p.bytes().len()) Those four lines are effectively all this crate does. The problem with those four lines is that they are very slow, especially when you're searching for a large number of patterns. While there are many different algorithms available to solve this, a popular one is Aho-Corasick. It's a common solution because it's not too hard to implement, scales quite well even when searching for thousands of patterns and is generally pretty fast. Aho-Corasick does well here because, regardless of the number of patterns you're searching for, it always visits each byte in the haystack exactly once. This means, generally speaking, adding more patterns to an Aho-Corasick automaton does not make it slower. (Strictly speaking, however, this is not true, since a larger automaton will make less effective use of the CPU's cache.) Aho-Corasick can be succinctly described as a trie with state transitions between some of the nodes that efficiently instruct the search algorithm to try matching alternative keys in the automaton. 
The trick is that these state transitions are arranged such that each byte of input needs to be inspected only once. These state transitions are typically called "failure transitions," because they instruct the searcher (the thing traversing the automaton while reading from the haystack) what to do when a byte in the haystack does not correspond to a valid transition in the current state of the trie. More formally, a failure transition points to a state in the automaton that may lead to a match whose prefix is a proper suffix of the path traversed through the trie so far. (If no such proper suffix exists, then the failure transition points back to the start state of the trie, effectively restarting the search.) This is perhaps simpler to explain pictorally. For example, let's say we built an Aho-Corasick automaton with the following patterns: 'abcd' and 'cef'. The trie looks like this: a - S1 - b - S2 - c - S3 - d - S4* / S0 - c - S5 - e - S6 - f - S7* where states marked with a `*` are match states (meaning, the search algorithm should stop and report a match to the caller). So given this trie, it should be somewhat straight-forward to see how it can be used to determine whether any particular haystack *starts* with either `abcd` or `cef`. It's easy to express this in code: fn has_prefix(trie: &Trie, haystack: &[u8]) -> bool { let mut state_id = trie.start(); // If the empty pattern is in trie, then state_id is a match state. if trie.is_match(state_id) { return true; } for (i, &b) in haystack.iter().enumerate() { state_id = match trie.next_state(state_id, b) { Some(id) => id, // If there was no transition for this state and byte, then we know // the haystack does not start with one of the patterns in our trie. None => return false, }; if trie.is_match(state_id) { return true; } } false } And that's pretty much it. All we do is move through the trie starting with the bytes at the beginning of the haystack. If we find ourselves in a position where we can't move, or if we've looked through the entire haystack without seeing a match state, then we know the haystack does not start with any of the patterns in the trie. The meat of the Aho-Corasick algorithm is in how we add failure transitions to our trie to keep searching efficient. Specifically, it permits us to not only check whether a haystack *starts* with any one of a number of patterns, but rather, whether the haystack contains any of a number of patterns *anywhere* in the haystack. As mentioned before, failure transitions connect a proper suffix of the path traversed through the trie before, with a path that leads to a match that has a prefix corresponding to that proper suffix. So in our case, for patterns `abcd` and `cef`, with a haystack `abcef`, we want to transition to state `S5` (from the diagram above) from `S3` upon seeing that the byte following `c` is not `d`. Namely, the proper suffix in this example is `c`, which is a prefix of `cef`. So the modified diagram looks like this: a - S1 - b - S2 - c - S3 - d - S4* / / / ---------------- / / S0 - c - S5 - e - S6 - f - S7* One thing that isn't shown in this diagram is that *all* states have a failure transition, but only `S3` has a *non-trivial* failure transition. That is, all other states have a failure transition back to the start state. So if our haystack was `abzabcd`, then the searcher would transition back to `S0` after seeing `z`, which effectively restarts the search. (Because there is no pattern in our trie that has a prefix of `bz` or `z`.) 
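To make this structure concrete, here is a rough sketch of what a single state
could look like if represented naively. This is purely illustrative: the names
(`State`, `transitions`, `fail`, `matches`) are invented for exposition, and the
crate's real representation in `src/nfa.rs` uses dense transitions near the root
and sparse transitions elsewhere rather than a hash map per state.

    // One Aho-Corasick state, represented naively: a trie node augmented
    // with a failure transition and the patterns that end here.
    struct State {
        // Goto transitions: for an input byte, the next state (if any).
        transitions: std::collections::HashMap<u8, usize>,
        // The failure transition. For most states this is the start state;
        // for a state like S3 in the diagram above, it points to S5.
        fail: usize,
        // Patterns ending at this state. Non-empty exactly for the match
        // states marked with `*` in the diagrams (S4 and S7).
        matches: Vec<usize>,
    }

With every state carrying a failure transition like this, the searcher never
gets stuck: when the current byte has no goto transition, it follows `fail`
(possibly more than once) and retries the byte.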
The code for traversing this *automaton* or *finite state machine* (it is no longer just a trie) is not that much different from the `has_prefix` code above: fn contains(fsm: &FiniteStateMachine, haystack: &[u8]) -> bool { let mut state_id = fsm.start(); // If the empty pattern is in fsm, then state_id is a match state. if fsm.is_match(state_id) { return true; } for (i, &b) in haystack.iter().enumerate() { // While the diagram above doesn't show this, we may wind up needing // to follow multiple failure transitions before we land on a state // in which we can advance. Therefore, when searching for the next // state, we need to loop until we don't see a failure transition. // // This loop terminates because the start state has no empty // transitions. Every transition from the start state either points to // another state, or loops back to the start state. loop { match fsm.next_state(state_id, b) { Some(id) => { state_id = id; break; } // Unlike our code above, if there was no transition for this // state, then we don't quit. Instead, we look for this state's // failure transition and follow that instead. None => { state_id = fsm.next_fail_state(state_id); } }; } if fsm.is_match(state_id) { return true; } } false } Other than the complication around traversing failure transitions, this code is still roughly "traverse the automaton with bytes from the haystack, and quit when a match is seen." And that concludes our section on the basics. While we didn't go deep into how the automaton is built (see `src/nfa.rs`, which has detailed comments about that), the basic structure of Aho-Corasick should be reasonably clear. # NFAs and DFAs There are generally two types of finite automata: non-deterministic finite automata (NFA) and deterministic finite automata (DFA). The difference between them is, principally, that an NFA can be in multiple states at once. This is typically accomplished by things called _epsilon_ transitions, where one could move to a new state without consuming any bytes from the input. (The other mechanism by which NFAs can be in more than one state is where the same byte in a particular state transitions to multiple distinct states.) In contrast, a DFA can only ever be in one state at a time. A DFA has no epsilon transitions, and for any given state, a byte transitions to at most one other state. By this formulation, the Aho-Corasick automaton described in the previous section is an NFA. This is because failure transitions are, effectively, epsilon transitions. That is, whenever the automaton is in state `S`, it is actually in the set of states that are reachable by recursively following failure transitions from `S`. (This means that, for example, the start state is always active since the start state is reachable via failure transitions from any state in the automaton.) NFAs have a lot of nice properties. They tend to be easier to construct, and also tend to use less memory. However, their primary downside is that they are typically slower to execute. For example, the code above showing how to search with an Aho-Corasick automaton needs to potentially iterate through many failure transitions for every byte of input. While this is a fairly small amount of overhead, this can add up, especially if the automaton has a lot of overlapping patterns with a lot of failure transitions. A DFA's search code, by contrast, looks like this: fn contains(dfa: &DFA, haystack: &[u8]) -> bool { let mut state_id = dfa.start(); // If the empty pattern is in dfa, then state_id is a match state. 
        if dfa.is_match(state_id) {
            return true;
        }
        for (i, &b) in haystack.iter().enumerate() {
            // An Aho-Corasick DFA *never* has a missing state that requires
            // failure transitions to be followed. One byte of input advances
            // the automaton by one state. Always.
            state_id = dfa.next_state(state_id, b);
            if dfa.is_match(state_id) {
                return true;
            }
        }
        false
    }

The search logic here is much simpler than for the NFA, and this tends to
translate into significant performance benefits as well, since there's a lot
less work being done for each byte in the haystack. How is this accomplished?
It's done by pre-following all failure transitions for all states for all
bytes in the alphabet, and then building a single state transition table.
Building this DFA can be much more costly than building the NFA, and use much
more memory, but the better performance can be worth it.

Users of this crate can actually choose between using an NFA or a DFA. By
default, an NFA is used, because it typically strikes the best balance between
space usage and search performance. But the DFA option is available for cases
where a little extra memory and upfront time building the automaton is okay.
For example, the `AhoCorasick::new_auto_configured` and
`AhoCorasickBuilder::auto_configure` methods will enable the DFA setting if
there are a small number of patterns.

# More DFA tricks

As described in the previous section, one of the downsides of using a DFA is
that it uses more memory and can take longer to build. One small way of
mitigating these concerns is to map the alphabet used by the automaton into a
smaller space. Typically, the alphabet of a DFA has 256 elements in it: one
element for each possible value that fits into a byte. However, in many cases,
one does not need the full alphabet. For example, if all patterns in an
Aho-Corasick automaton are ASCII letters, then this only uses up 52 distinct
bytes. As far as the automaton is concerned, the remaining 204 bytes are
indistinguishable from one another: they will never discriminate between a
match and a non-match. Therefore, in cases like that, the alphabet can be
shrunk to just 53 elements: one for each ASCII letter, and one more to serve
as a placeholder for every other unused byte. In practice, this library
doesn't quite compute the optimal set of equivalence classes, but it's close
enough in most cases. The key idea is that this then allows the transition
table for the DFA to be potentially much smaller.

The downside of doing this, however, is that since the transition table is
defined in terms of this smaller alphabet space, every byte in the haystack
must be re-mapped to this smaller space. This requires an additional 256-byte
table. In practice, this can lead to a small search time hit, but it can be
difficult to measure. Moreover, it can sometimes lead to faster search times
for bigger automata, since it could be the difference between more parts of
the automaton staying in the CPU cache or not.

One other trick for DFAs employed by this crate is the notion of
premultiplying state identifiers. Specifically, the normal way to compute the
next transition in a DFA is via the following (assuming that the transition
table is laid out sequentially in memory, in row-major order, where the rows
are states):

    next_state_id = dfa.transitions[current_state_id * 256 + current_byte]

However, since the value `256` is a fixed constant, we can actually
premultiply the state identifiers in the table when we build the table
initially.
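As a rough sketch of the build-time transformation being described (the
function and parameter names here are invented for illustration and are not
the crate's actual API; the real premultiplication happens during DFA
construction):

    // Sketch: rewrite every state identifier stored in the transition table
    // as `id * alphabet_len`, where `alphabet_len` is 256, or the number of
    // byte equivalence classes when those are enabled. The searcher keeps
    // its current state id in the same premultiplied form, so each stored
    // id doubles as the byte offset of that state's row in the table.
    fn premultiply(transitions: &mut [usize], alphabet_len: usize) {
        for next_id in transitions.iter_mut() {
            *next_id *= alphabet_len;
        }
    }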
Then, the next transition computation simply becomes: next_state_id = dfa.transitions[current_state_id + current_byte] This doesn't seem like much, but when this is being executed for every byte of input that you're searching, saving that extra multiplication instruction can add up. The same optimization works even when equivalence classes are enabled, as described above. The only difference is that the premultiplication is by the total number of equivalence classes instead of 256. There isn't much downside to premultiplying state identifiers, other than the fact that you may need to choose a bigger integer representation than you would otherwise. For example, if you don't premultiply state identifiers, then an automaton that uses `u8` as a state identifier can hold up to 256 states. However, if they are premultiplied, then it can only hold up to `floor(256 / len(alphabet))` states. Thus premultiplication impacts how compact your DFA can be. In practice, it's pretty rare to use `u8` as a state identifier, so premultiplication is usually a good thing to do. Both equivalence classes and premultiplication are tuneable parameters via the `AhoCorasickBuilder` type, and both are enabled by default. # Match semantics One of the more interesting things about this implementation of Aho-Corasick that (as far as this author knows) separates it from other implementations, is that it natively supports leftmost-first and leftmost-longest match semantics. Briefly, match semantics refer to the decision procedure by which searching will disambiguate matches when there are multiple to choose from: * **standard** match semantics emits matches as soon as they are detected by the automaton. This is typically equivalent to the textbook non-overlapping formulation of Aho-Corasick. * **leftmost-first** match semantics means that 1) the next match is the match starting at the leftmost position and 2) among multiple matches starting at the same leftmost position, the match corresponding to the pattern provided first by the caller is reported. * **leftmost-longest** is like leftmost-first, except when there are multiple matches starting at the same leftmost position, the pattern corresponding to the longest match is returned. (The crate API documentation discusses these differences, with examples, in more depth on the `MatchKind` type.) The reason why supporting these match semantics is important is because it gives the user more control over the match procedure. For example, leftmost-first permits users to implement match priority by simply putting the higher priority patterns first. Leftmost-longest, on the other hand, permits finding the longest possible match, which might be useful when trying to find words matching a dictionary. Additionally, regex engines often want to use Aho-Corasick as an optimization when searching for an alternation of literals. In order to preserve correct match semantics, regex engines typically can't use the standard textbook definition directly, since regex engines will implement either leftmost-first (Perl-like) or leftmost-longest (POSIX) match semantics. Supporting leftmost semantics requires a couple key changes: * Constructing the Aho-Corasick automaton changes a bit in both how the trie is constructed and how failure transitions are found. Namely, only a subset of the failure transitions are added. Specifically, only the failure transitions that either do not occur after a match or do occur after a match but preserve that match are kept. 
(More details on this can be found in `src/nfa.rs`.) * The search algorithm changes slightly. Since we are looking for the leftmost match, we cannot quit as soon as a match is detected. Instead, after a match is detected, we must keep searching until either the end of the input or until a dead state is seen. (Dead states are not used for standard match semantics. Dead states mean that searching should stop after a match has been found.) Other implementations of Aho-Corasick do support leftmost match semantics, but they do it with more overhead at search time, or even worse, with a queue of matches and sophisticated hijinks to disambiguate the matches. While our construction algorithm becomes a bit more complicated, the correct match semantics fall out from the structure of the automaton itself. # Overlapping matches One of the nice properties of an Aho-Corasick automaton is that it can report all possible matches, even when they overlap with one another. In this mode, the match semantics don't matter, since all possible matches are reported. Overlapping searches work just like regular searches, except the state identifier at which the previous search left off is carried over to the next search, so that it can pick up where it left off. If there are additional matches at that state, then they are reported before resuming the search. Enabling leftmost-first or leftmost-longest match semantics causes the automaton to use a subset of all failure transitions, which means that overlapping searches cannot be used. Therefore, if leftmost match semantics are used, attempting to do an overlapping search will panic. Thus, to get overlapping searches, the caller must use the default standard match semantics. This behavior was chosen because there are only two alternatives, which were deemed worse: * Compile two automatons internally, one for standard semantics and one for the semantics requested by the caller (if not standard). * Create a new type, distinct from the `AhoCorasick` type, which has different capabilities based on the configuration options. The first is untenable because of the amount of memory used by the automaton. The second increases the complexity of the API too much by adding too many types that do similar things. It is conceptually much simpler to keep all searching isolated to a single type. Callers may query whether the automaton supports overlapping searches via the `AhoCorasick::supports_overlapping` method. # Stream searching Since Aho-Corasick is an automaton, it is possible to do partial searches on partial parts of the haystack, and then resume that search on subsequent pieces of the haystack. This is useful when the haystack you're trying to search is not stored contiguous in memory, or if one does not want to read the entire haystack into memory at once. Currently, only standard semantics are supported for stream searching. This is some of the more complicated code in this crate, and is something I would very much like to improve. In particular, it currently has the restriction that it must buffer at least enough of the haystack in memory in order to fit the longest possible match. The difficulty in getting stream searching right is that the implementation choices (such as the buffer size) often impact what the API looks like and what it's allowed to do. # Prefilters In some cases, Aho-Corasick is not the fastest way to find matches containing multiple patterns. Sometimes, the search can be accelerated using highly optimized SIMD routines. 
For example, consider searching the following patterns: Sherlock Moriarty Watson It is plausible that it would be much faster to quickly look for occurrences of the leading bytes, `S`, `M` or `W`, before trying to start searching via the automaton. Indeed, this is exactly what this crate will do. When there are more than three distinct starting bytes, then this crate will look for three distinct bytes occurring at any position in the patterns, while preferring bytes that are heuristically determined to be rare over others. For example: Abuzz Sanchez Vasquez Topaz Waltz Here, we have more than 3 distinct starting bytes, but all of the patterns contain `z`, which is typically a rare byte. In this case, the prefilter will scan for `z`, back up a bit, and then execute the Aho-Corasick automaton. If all of that fails, then a packed multiple substring algorithm will be attempted. Currently, the only algorithm available for this is Teddy, but more may be added in the future. Teddy is unlike the above prefilters in that it confirms its own matches, so when Teddy is active, it might not be necessary for Aho-Corasick to run at all. (See `Automaton::leftmost_find_at_no_state_imp` in `src/automaton.rs`.) However, the current Teddy implementation only works in `x86_64` and when SSSE3 or AVX2 are available, and moreover, only works _well_ when there are a small number of patterns (say, less than 100). Teddy also requires the haystack to be of a certain length (more than 16-34 bytes). When the haystack is shorter than that, Rabin-Karp is used instead. (See `src/packed/rabinkarp.rs`.) There is a more thorough description of Teddy at [`src/packed/teddy/README.md`](src/packed/teddy/README.md). aho-corasick-0.7.8/LICENSE-MIT010064400017500000144000000020711274016735300137520ustar0000000000000000The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. aho-corasick-0.7.8/README.md010066400017500000144000000134231360640551600136000ustar0000000000000000aho-corasick ============ A library for finding occurrences of many patterns at once with SIMD acceleration in some cases. This library provides multiple pattern search principally through an implementation of the [Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), which builds a finite state machine for executing searches in linear time. Features include case insensitive matching, overlapping matches and search & replace in streams. 
[![Build status](https://github.com/BurntSushi/aho-corasick/workflows/ci/badge.svg)](https://github.com/BurntSushi/aho-corasick/actions) [![](http://meritbadge.herokuapp.com/aho-corasick)](https://crates.io/crates/aho-corasick) Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). ### Documentation https://docs.rs/aho-corasick ### Usage Add this to your `Cargo.toml`: ```toml [dependencies] aho-corasick = "0.7" ``` and this to your crate root (if you're using Rust 2015): ```rust extern crate aho_corasick; ``` ### Example: basic searching This example shows how to search for occurrences of multiple patterns simultaneously. Each match includes the pattern that matched along with the byte offsets of the match. ```rust use aho_corasick::AhoCorasick; let patterns = &["apple", "maple", "Snapple"]; let haystack = "Nobody likes maple in their apple flavored Snapple."; let ac = AhoCorasick::new(patterns); let mut matches = vec![]; for mat in ac.find_iter(haystack) { matches.push((mat.pattern(), mat.start(), mat.end())); } assert_eq!(matches, vec![ (1, 13, 18), (0, 28, 33), (2, 43, 50), ]); ``` ### Example: case insensitivity This is like the previous example, but matches `Snapple` case insensitively using `AhoCorasickBuilder`: ```rust use aho_corasick::AhoCorasickBuilder; let patterns = &["apple", "maple", "snapple"]; let haystack = "Nobody likes maple in their apple flavored Snapple."; let ac = AhoCorasickBuilder::new() .ascii_case_insensitive(true) .build(patterns); let mut matches = vec![]; for mat in ac.find_iter(haystack) { matches.push((mat.pattern(), mat.start(), mat.end())); } assert_eq!(matches, vec![ (1, 13, 18), (0, 28, 33), (2, 43, 50), ]); ``` ### Example: replacing matches in a stream This example shows how to execute a search and replace on a stream without loading the entire stream into memory first. ```rust use aho_corasick::AhoCorasick; let patterns = &["fox", "brown", "quick"]; let replace_with = &["sloth", "grey", "slow"]; // In a real example, these might be `std::fs::File`s instead. All you need to // do is supply a pair of `std::io::Read` and `std::io::Write` implementations. let rdr = "The quick brown fox."; let mut wtr = vec![]; let ac = AhoCorasick::new(patterns); ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?; assert_eq!(b"The slow grey sloth.".to_vec(), wtr); ``` ### Example: finding the leftmost first match In the textbook description of Aho-Corasick, its formulation is typically structured such that it reports all possible matches, even when they overlap with another. In many cases, overlapping matches may not be desired, such as the case of finding all successive non-overlapping matches like you might with a standard regular expression. Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do this doesn't always work in the expected way, since it will report matches as soon as they are seen. For example, consider matching the regex `Samwise|Sam` against the text `Samwise`. Most regex engines (that are Perl-like, or non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick algorithm modified for reporting non-overlapping matches will report `Sam`. A novel contribution of this library is the ability to change the match semantics of Aho-Corasick (without additional search time overhead) such that `Samwise` is reported instead. 
For example, here's the standard approach: ```rust use aho_corasick::AhoCorasick; let patterns = &["Samwise", "Sam"]; let haystack = "Samwise"; let ac = AhoCorasick::new(patterns); let mat = ac.find(haystack).expect("should have a match"); assert_eq!("Sam", &haystack[mat.start()..mat.end()]); ``` And now here's the leftmost-first version, which matches how a Perl-like regex will work: ```rust use aho_corasick::{AhoCorasickBuilder, MatchKind}; let patterns = &["Samwise", "Sam"]; let haystack = "Samwise"; let ac = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostFirst) .build(patterns); let mat = ac.find(haystack).expect("should have a match"); assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); ``` In addition to leftmost-first semantics, this library also supports leftmost-longest semantics, which match the POSIX behavior of a regular expression alternation. See `MatchKind` in the docs for more details. ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.28.0`. The current policy is that the minimum Rust version required to use this crate can be increased in minor version updates. For example, if `crate 1.0` requires Rust 1.20.0, then `crate 1.0.z` for all values of `z` will also require Rust 1.20.0 or newer. However, `crate 1.y` for `y > 0` may require a newer minimum version of Rust. In general, this crate will be conservative with respect to the minimum supported version of Rust. ### Future work Here are some plans for the future: * Assuming the current API is sufficient, I'd like to commit to it and release a `1.0` version of this crate some time in the next 6-12 months. * Support stream searching with leftmost match semantics. Currently, only standard match semantics are supported. Getting this right seems possible, but is tricky since the match state needs to be propagated through multiple searches. (With standard semantics, as soon as a match is seen the search ends.) aho-corasick-0.7.8/UNLICENSE010064400017500000144000000022731274016735300135720ustar0000000000000000This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
For more information, please refer to aho-corasick-0.7.8/rustfmt.toml010064400017500000144000000000541352131022200146750ustar0000000000000000max_width = 79 use_small_heuristics = "max" aho-corasick-0.7.8/src/ahocorasick.rs010064400017500000144000002306511352132431400157360ustar0000000000000000use std::io; use automaton::Automaton; use buffer::Buffer; use dfa::{self, DFA}; use error::Result; use nfa::{self, NFA}; use packed; use prefilter::PrefilterState; use state_id::StateID; use Match; /// An automaton for searching multiple strings in linear time. /// /// The `AhoCorasick` type supports a few basic ways of constructing an /// automaton, including /// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new) /// and /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). /// However, there are a fair number of configurable options that can be set /// by using /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) /// instead. Such options include, but are not limited to, how matches are /// determined, simple case insensitivity, whether to use a DFA or not and /// various knobs for controlling the space-vs-time trade offs taken when /// building the automaton. /// /// If you aren't sure where to start, try beginning with /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured). /// /// # Resource usage /// /// Aho-Corasick automatons are always constructed in `O(p)` time, where `p` /// is the combined length of all patterns being searched. With that said, /// building an automaton can be fairly costly because of high constant /// factors, particularly when enabling the /// [DFA](struct.AhoCorasickBuilder.html#method.dfa) /// option (which is disabled by default). For this reason, it's generally a /// good idea to build an automaton once and reuse it as much as possible. /// /// Aho-Corasick automatons can also use a fair bit of memory. To get a /// concrete idea of how much memory is being used, try using the /// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes) /// method. /// /// # Examples /// /// This example shows how to search for occurrences of multiple patterns /// simultaneously in a case insensitive fashion. Each match includes the /// pattern that matched along with the byte offsets of the match. /// /// ``` /// use aho_corasick::AhoCorasickBuilder; /// /// let patterns = &["apple", "maple", "snapple"]; /// let haystack = "Nobody likes maple in their apple flavored Snapple."; /// /// let ac = AhoCorasickBuilder::new() /// .ascii_case_insensitive(true) /// .build(patterns); /// let mut matches = vec![]; /// for mat in ac.find_iter(haystack) { /// matches.push((mat.pattern(), mat.start(), mat.end())); /// } /// assert_eq!(matches, vec![ /// (1, 13, 18), /// (0, 28, 33), /// (2, 43, 50), /// ]); /// ``` /// /// This example shows how to replace matches with some other string: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let patterns = &["fox", "brown", "quick"]; /// let haystack = "The quick brown fox."; /// let replace_with = &["sloth", "grey", "slow"]; /// /// let ac = AhoCorasick::new(patterns); /// let result = ac.replace_all(haystack, replace_with); /// assert_eq!(result, "The slow grey sloth."); /// ``` #[derive(Clone, Debug)] pub struct AhoCorasick { imp: Imp, match_kind: MatchKind, } impl AhoCorasick { /// Create a new Aho-Corasick automaton using the default configuration. 
/// /// The default configuration optimizes for less space usage, but at the /// expense of longer search times. To change the configuration, use /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) /// for fine-grained control, or /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) /// for automatic configuration if you aren't sure which settings to pick. /// /// This uses the default /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) /// match semantics, which reports a match as soon as it is found. This /// corresponds to the standard match semantics supported by textbook /// descriptions of the Aho-Corasick algorithm. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&[ /// "foo", "bar", "baz", /// ]); /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); /// ``` pub fn new(patterns: I) -> AhoCorasick where I: IntoIterator, P: AsRef<[u8]>, { AhoCorasickBuilder::new().build(patterns) } /// Build an Aho-Corasick automaton with an automatically determined /// configuration. /// /// Specifically, this requires a slice of patterns instead of an iterator /// since the configuration is determined by looking at the patterns before /// constructing the automaton. The idea here is to balance space and time /// automatically. That is, when searching a small number of patterns, this /// will attempt to use the fastest possible configuration since the total /// space required will be small anyway. As the number of patterns grows, /// this will fall back to slower configurations that use less space. /// /// If you want auto configuration but with match semantics different from /// the default `MatchKind::Standard`, then use /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure). /// /// # Examples /// /// Basic usage is just like `new`, except you must provide a slice: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new_auto_configured(&[ /// "foo", "bar", "baz", /// ]); /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); /// ``` pub fn new_auto_configured(patterns: &[B]) -> AhoCorasick where B: AsRef<[u8]>, { AhoCorasickBuilder::new().auto_configure(patterns).build(patterns) } } impl AhoCorasick { /// Returns true if and only if this automaton matches the haystack at any /// position. /// /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. /// This includes, but is not limited to, `String`, `&str`, `Vec`, and /// `&[u8]` itself. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&[ /// "foo", "bar", "quux", "baz", /// ]); /// assert!(ac.is_match("xxx bar xxx")); /// assert!(!ac.is_match("xxx qux xxx")); /// ``` pub fn is_match>(&self, haystack: B) -> bool { self.earliest_find(haystack).is_some() } /// Returns the location of the first detected match in `haystack`. /// /// This method has the same behavior regardless of the /// [`MatchKind`](enum.MatchKind.html) /// of this automaton. /// /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. /// This includes, but is not limited to, `String`, `&str`, `Vec`, and /// `&[u8]` itself. 
/// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&[ /// "abc", "b", /// ]); /// let mat = ac.earliest_find("abcd").expect("should have match"); /// assert_eq!(1, mat.pattern()); /// assert_eq!((1, 2), (mat.start(), mat.end())); /// ``` pub fn earliest_find>(&self, haystack: B) -> Option { let mut prestate = PrefilterState::new(self.max_pattern_len()); let mut start = self.imp.start_state(); self.imp.earliest_find_at( &mut prestate, haystack.as_ref(), 0, &mut start, ) } /// Returns the location of the first match according to the match /// semantics that this automaton was constructed with. /// /// When using `MatchKind::Standard`, this corresponds precisely to the /// same behavior as /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find). /// Otherwise, match semantics correspond to either /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst) /// or /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest). /// /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. /// This includes, but is not limited to, `String`, `&str`, `Vec`, and /// `&[u8]` itself. /// /// # Examples /// /// Basic usage, with standard semantics: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::Standard) // default, not necessary /// .build(patterns); /// let mat = ac.find(haystack).expect("should have a match"); /// assert_eq!("b", &haystack[mat.start()..mat.end()]); /// ``` /// /// Now with leftmost-first semantics: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns); /// let mat = ac.find(haystack).expect("should have a match"); /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); /// ``` /// /// And finally, leftmost-longest semantics: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostLongest) /// .build(patterns); /// let mat = ac.find(haystack).expect("should have a match"); /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); /// ``` pub fn find>(&self, haystack: B) -> Option { let mut prestate = PrefilterState::new(self.max_pattern_len()); self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0) } /// Returns an iterator of non-overlapping matches, using the match /// semantics that this automaton was constructed with. /// /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. /// This includes, but is not limited to, `String`, `&str`, `Vec`, and /// `&[u8]` itself. 
/// /// # Examples /// /// Basic usage, with standard semantics: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::Standard) // default, not necessary /// .build(patterns); /// let matches: Vec = ac /// .find_iter(haystack) /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![2, 2, 2], matches); /// ``` /// /// Now with leftmost-first semantics: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns); /// let matches: Vec = ac /// .find_iter(haystack) /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![0, 2, 0], matches); /// ``` /// /// And finally, leftmost-longest semantics: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostLongest) /// .build(patterns); /// let matches: Vec = ac /// .find_iter(haystack) /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![0, 2, 1], matches); /// ``` pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindIter<'a, 'b, S> { FindIter::new(self, haystack.as_ref()) } /// Returns an iterator of overlapping matches in the given `haystack`. /// /// Overlapping matches can _only_ be detected using /// `MatchKind::Standard` semantics. If this automaton was constructed with /// leftmost semantics, then this method will panic. To determine whether /// this will panic at runtime, use the /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping) /// method. /// /// `haystack` may be any type that is cheaply convertible to a `&[u8]`. /// This includes, but is not limited to, `String`, `&str`, `Vec`, and /// `&[u8]` itself. /// /// # Panics /// /// This panics when `AhoCorasick::supports_overlapping` returns `false`. /// That is, this panics when this automaton's match semantics are not /// `MatchKind::Standard`. /// /// # Examples /// /// Basic usage, with standard semantics: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::new(patterns); /// let matches: Vec = ac /// .find_overlapping_iter(haystack) /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches); /// ``` pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindOverlappingIter<'a, 'b, S> { FindOverlappingIter::new(self, haystack.as_ref()) } /// Replace all matches with a corresponding value in the `replace_with` /// slice given. Matches correspond to the same matches as reported by /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). /// /// Replacements are determined by the index of the matching pattern. /// For example, if the pattern with index `2` is found, then it is /// replaced by `replace_with[2]`. /// /// # Panics /// /// This panics when `replace_with.len()` does not equal the total number /// of patterns that are matched by this automaton. 
/// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns); /// let result = ac.replace_all(haystack, &["x", "y", "z"]); /// assert_eq!("x the z to the xage", result); /// ``` pub fn replace_all(&self, haystack: &str, replace_with: &[B]) -> String where B: AsRef, { assert_eq!( replace_with.len(), self.pattern_count(), "replace_all requires a replacement for every pattern \ in the automaton" ); let mut dst = String::with_capacity(haystack.len()); self.replace_all_with(haystack, &mut dst, |mat, _, dst| { dst.push_str(replace_with[mat.pattern()].as_ref()); true }); dst } /// Replace all matches using raw bytes with a corresponding value in the /// `replace_with` slice given. Matches correspond to the same matches as /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter). /// /// Replacements are determined by the index of the matching pattern. /// For example, if the pattern with index `2` is found, then it is /// replaced by `replace_with[2]`. /// /// # Panics /// /// This panics when `replace_with.len()` does not equal the total number /// of patterns that are matched by this automaton. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = b"append the app to the appendage"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns); /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]); /// assert_eq!(b"x the z to the xage".to_vec(), result); /// ``` pub fn replace_all_bytes( &self, haystack: &[u8], replace_with: &[B], ) -> Vec where B: AsRef<[u8]>, { assert_eq!( replace_with.len(), self.pattern_count(), "replace_all_bytes requires a replacement for every pattern \ in the automaton" ); let mut dst = Vec::with_capacity(haystack.len()); self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| { dst.extend(replace_with[mat.pattern()].as_ref()); true }); dst } /// Replace all matches using a closure called on each match. /// Matches correspond to the same matches as reported by /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). /// /// The closure accepts three parameters: the match found, the text of /// the match and a string buffer with which to write the replaced text /// (if any). If the closure returns `true`, then it continues to the next /// match. If the closure returns false, then searching is stopped. 
/// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns); /// let mut result = String::new(); /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| { /// dst.push_str(&mat.pattern().to_string()); /// true /// }); /// assert_eq!("0 the 2 to the 0age", result); /// ``` pub fn replace_all_with( &self, haystack: &str, dst: &mut String, mut replace_with: F, ) where F: FnMut(&Match, &str, &mut String) -> bool, { let mut last_match = 0; for mat in self.find_iter(haystack) { dst.push_str(&haystack[last_match..mat.start()]); last_match = mat.end(); replace_with(&mat, &haystack[mat.start()..mat.end()], dst); } dst.push_str(&haystack[last_match..]); } /// Replace all matches using raw bytes with a closure called on each /// match. Matches correspond to the same matches as reported by /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). /// /// The closure accepts three parameters: the match found, the text of /// the match and a byte buffer with which to write the replaced text /// (if any). If the closure returns `true`, then it continues to the next /// match. If the closure returns false, then searching is stopped. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["append", "appendage", "app"]; /// let haystack = b"append the app to the appendage"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns); /// let mut result = vec![]; /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| { /// dst.extend(mat.pattern().to_string().bytes()); /// true /// }); /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result); /// ``` pub fn replace_all_with_bytes( &self, haystack: &[u8], dst: &mut Vec, mut replace_with: F, ) where F: FnMut(&Match, &[u8], &mut Vec) -> bool, { let mut last_match = 0; for mat in self.find_iter(haystack) { dst.extend(&haystack[last_match..mat.start()]); last_match = mat.end(); replace_with(&mat, &haystack[mat.start()..mat.end()], dst); } dst.extend(&haystack[last_match..]); } /// Returns an iterator of non-overlapping matches in the given /// stream. Matches correspond to the same matches as reported by /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). /// /// The matches yielded by this iterator use absolute position offsets in /// the stream given, where the first byte has index `0`. Matches are /// yieled until the stream is exhausted. /// /// Each item yielded by the iterator is an `io::Result`, where an /// error is yielded if there was a problem reading from the reader given. /// /// When searching a stream, an internal buffer is used. Therefore, callers /// should avoiding providing a buffered reader, if possible. /// /// Searching a stream requires that the automaton was built with /// `MatchKind::Standard` semantics. If this automaton was constructed /// with leftmost semantics, then this method will panic. To determine /// whether this will panic at runtime, use the /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) /// method. /// /// # Memory usage /// /// In general, searching streams will use a constant amount of memory for /// its internal buffer. 
The one requirement is that the internal buffer /// must be at least the size of the longest possible match. In most use /// cases, the default buffer size will be much larger than any individual /// match. /// /// # Panics /// /// This panics when `AhoCorasick::supports_stream` returns `false`. /// That is, this panics when this automaton's match semantics are not /// `MatchKind::Standard`. This restriction may be lifted in the future. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// # fn example() -> Result<(), ::std::io::Error> { /// let patterns = &["append", "appendage", "app"]; /// let haystack = "append the app to the appendage"; /// /// let ac = AhoCorasick::new(patterns); /// let mut matches = vec![]; /// for result in ac.stream_find_iter(haystack.as_bytes()) { /// let mat = result?; /// matches.push(mat.pattern()); /// } /// assert_eq!(vec![2, 2, 2], matches); /// # Ok(()) }; example().unwrap() /// ``` pub fn stream_find_iter<'a, R: io::Read>( &'a self, rdr: R, ) -> StreamFindIter<'a, R, S> { StreamFindIter::new(self, rdr) } /// Search for and replace all matches of this automaton in /// the given reader, and write the replacements to the given /// writer. Matches correspond to the same matches as reported by /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). /// /// Replacements are determined by the index of the matching pattern. /// For example, if the pattern with index `2` is found, then it is /// replaced by `replace_with[2]`. /// /// After all matches are replaced, the writer is _not_ flushed. /// /// If there was a problem reading from the given reader or writing to the /// given writer, then the corresponding `io::Error` is returned and all /// replacement is stopped. /// /// When searching a stream, an internal buffer is used. Therefore, callers /// should avoiding providing a buffered reader, if possible. However, /// callers may want to provide a buffered writer. /// /// Searching a stream requires that the automaton was built with /// `MatchKind::Standard` semantics. If this automaton was constructed /// with leftmost semantics, then this method will panic. To determine /// whether this will panic at runtime, use the /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) /// method. /// /// # Memory usage /// /// In general, searching streams will use a constant amount of memory for /// its internal buffer. The one requirement is that the internal buffer /// must be at least the size of the longest possible match. In most use /// cases, the default buffer size will be much larger than any individual /// match. /// /// # Panics /// /// This panics when `AhoCorasick::supports_stream` returns `false`. /// That is, this panics when this automaton's match semantics are not /// `MatchKind::Standard`. This restriction may be lifted in the future. 
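    ///
    /// For instance, one way to guard against that panic (a sketch that
    /// relies only on the `supports_stream` method mentioned above) is to
    /// check the automaton before searching:
    ///
    /// ```
    /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
    ///
    /// let ac = AhoCorasickBuilder::new()
    ///     .match_kind(MatchKind::LeftmostFirst)
    ///     .build(&["foo", "bar"]);
    /// // Leftmost match semantics do not support stream searching, so
    /// // calling `stream_replace_all` here would panic.
    /// assert!(!ac.supports_stream());
    /// ```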
/// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// # fn example() -> Result<(), ::std::io::Error> { /// let patterns = &["fox", "brown", "quick"]; /// let haystack = "The quick brown fox."; /// let replace_with = &["sloth", "grey", "slow"]; /// /// let ac = AhoCorasick::new(patterns); /// let mut result = vec![]; /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?; /// assert_eq!(b"The slow grey sloth.".to_vec(), result); /// # Ok(()) }; example().unwrap() /// ``` pub fn stream_replace_all( &self, rdr: R, wtr: W, replace_with: &[B], ) -> io::Result<()> where R: io::Read, W: io::Write, B: AsRef<[u8]>, { assert_eq!( replace_with.len(), self.pattern_count(), "stream_replace_all requires a replacement for every pattern \ in the automaton" ); self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| { wtr.write_all(replace_with[mat.pattern()].as_ref()) }) } /// Search the given reader and replace all matches of this automaton /// using the given closure. The result is written to the given /// writer. Matches correspond to the same matches as reported by /// [`find_iter`](struct.AhoCorasick.html#method.find_iter). /// /// The closure accepts three parameters: the match found, the text of /// the match and the writer with which to write the replaced text /// (if any). If the closure returns `true`, then it continues to the next /// match. If the closure returns false, then searching is stopped. /// /// After all matches are replaced, the writer is _not_ flushed. /// /// If there was a problem reading from the given reader or writing to the /// given writer, then the corresponding `io::Error` is returned and all /// replacement is stopped. /// /// When searching a stream, an internal buffer is used. Therefore, callers /// should avoiding providing a buffered reader, if possible. However, /// callers may want to provide a buffered writer. /// /// Searching a stream requires that the automaton was built with /// `MatchKind::Standard` semantics. If this automaton was constructed /// with leftmost semantics, then this method will panic. To determine /// whether this will panic at runtime, use the /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream) /// method. /// /// # Memory usage /// /// In general, searching streams will use a constant amount of memory for /// its internal buffer. The one requirement is that the internal buffer /// must be at least the size of the longest possible match. In most use /// cases, the default buffer size will be much larger than any individual /// match. /// /// # Panics /// /// This panics when `AhoCorasick::supports_stream` returns `false`. /// That is, this panics when this automaton's match semantics are not /// `MatchKind::Standard`. This restriction may be lifted in the future. 
/// /// # Examples /// /// Basic usage: /// /// ``` /// use std::io::Write; /// use aho_corasick::AhoCorasick; /// /// # fn example() -> Result<(), ::std::io::Error> { /// let patterns = &["fox", "brown", "quick"]; /// let haystack = "The quick brown fox."; /// /// let ac = AhoCorasick::new(patterns); /// let mut result = vec![]; /// ac.stream_replace_all_with( /// haystack.as_bytes(), /// &mut result, /// |mat, _, wtr| { /// wtr.write_all(mat.pattern().to_string().as_bytes()) /// }, /// )?; /// assert_eq!(b"The 2 1 0.".to_vec(), result); /// # Ok(()) }; example().unwrap() /// ``` pub fn stream_replace_all_with( &self, rdr: R, mut wtr: W, mut replace_with: F, ) -> io::Result<()> where R: io::Read, W: io::Write, F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>, { let mut it = StreamChunkIter::new(self, rdr); while let Some(result) = it.next() { let chunk = result?; match chunk { StreamChunk::NonMatch { bytes, .. } => { wtr.write_all(bytes)?; } StreamChunk::Match { bytes, mat } => { replace_with(&mat, bytes, &mut wtr)?; } } } Ok(()) } /// Returns the match kind used by this automaton. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasick, MatchKind}; /// /// let ac = AhoCorasick::new(&[ /// "foo", "bar", "quux", "baz", /// ]); /// assert_eq!(&MatchKind::Standard, ac.match_kind()); /// ``` pub fn match_kind(&self) -> &MatchKind { self.imp.match_kind() } /// Returns the length of the longest pattern matched by this automaton. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&[ /// "foo", "bar", "quux", "baz", /// ]); /// assert_eq!(4, ac.max_pattern_len()); /// ``` pub fn max_pattern_len(&self) -> usize { self.imp.max_pattern_len() } /// Return the total number of patterns matched by this automaton. /// /// This includes patterns that may never participate in a match. For /// example, if /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst) /// match semantics are used, and the patterns `Sam` and `Samwise` were /// used to build the automaton, then `Samwise` can never participate in a /// match because `Sam` will always take priority. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&[ /// "foo", "bar", "baz", /// ]); /// assert_eq!(3, ac.pattern_count()); /// ``` pub fn pattern_count(&self) -> usize { self.imp.pattern_count() } /// Returns true if and only if this automaton supports reporting /// overlapping matches. /// /// If this returns false and overlapping matches are requested, then it /// will result in a panic. /// /// Since leftmost matching is inherently incompatible with overlapping /// matches, only /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) /// supports overlapping matches. This is unlikely to change in the future. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::Standard) /// .build(&["foo", "bar", "baz"]); /// assert!(ac.supports_overlapping()); /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostFirst) /// .build(&["foo", "bar", "baz"]); /// assert!(!ac.supports_overlapping()); /// ``` pub fn supports_overlapping(&self) -> bool { self.match_kind.supports_overlapping() } /// Returns true if and only if this automaton supports stream searching. 
/// /// If this returns false and stream searching (or replacing) is attempted, /// then it will result in a panic. /// /// Currently, only /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard) /// supports streaming. This may be expanded in the future. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::Standard) /// .build(&["foo", "bar", "baz"]); /// assert!(ac.supports_stream()); /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostFirst) /// .build(&["foo", "bar", "baz"]); /// assert!(!ac.supports_stream()); /// ``` pub fn supports_stream(&self) -> bool { self.match_kind.supports_stream() } /// Returns the approximate total amount of heap used by this automaton, in /// units of bytes. /// /// # Examples /// /// This example shows the difference in heap usage between a few /// configurations: /// /// ```ignore /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let ac = AhoCorasickBuilder::new() /// .dfa(false) // default /// .build(&["foo", "bar", "baz"]); /// assert_eq!(10_336, ac.heap_bytes()); /// /// let ac = AhoCorasickBuilder::new() /// .dfa(false) // default /// .ascii_case_insensitive(true) /// .build(&["foo", "bar", "baz"]); /// assert_eq!(10_384, ac.heap_bytes()); /// /// let ac = AhoCorasickBuilder::new() /// .dfa(true) /// .byte_classes(false) /// .build(&["foo", "bar", "baz"]); /// assert_eq!(20_768, ac.heap_bytes()); /// /// let ac = AhoCorasickBuilder::new() /// .dfa(true) /// .byte_classes(true) // default /// .build(&["foo", "bar", "baz"]); /// assert_eq!(1_248, ac.heap_bytes()); /// /// let ac = AhoCorasickBuilder::new() /// .dfa(true) /// .ascii_case_insensitive(true) /// .build(&["foo", "bar", "baz"]); /// assert_eq!(1_248, ac.heap_bytes()); /// ``` pub fn heap_bytes(&self) -> usize { match self.imp { Imp::NFA(ref nfa) => nfa.heap_bytes(), Imp::DFA(ref dfa) => dfa.heap_bytes(), } } } /// The internal implementation of Aho-Corasick, which is either an NFA or /// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses /// more memory. #[derive(Clone, Debug)] enum Imp { NFA(NFA), DFA(DFA), } impl Imp { /// Returns the type of match semantics implemented by this automaton. fn match_kind(&self) -> &MatchKind { match *self { Imp::NFA(ref nfa) => nfa.match_kind(), Imp::DFA(ref dfa) => dfa.match_kind(), } } /// Returns the identifier of the start state. fn start_state(&self) -> S { match *self { Imp::NFA(ref nfa) => nfa.start_state(), Imp::DFA(ref dfa) => dfa.start_state(), } } /// The length, in bytes, of the longest pattern in this automaton. This /// information is useful for maintaining correct buffer sizes when /// searching on streams. fn max_pattern_len(&self) -> usize { match *self { Imp::NFA(ref nfa) => nfa.max_pattern_len(), Imp::DFA(ref dfa) => dfa.max_pattern_len(), } } /// The total number of patterns added to this automaton. This includes /// patterns that may never match. The maximum matching pattern that can be /// reported is exactly one less than this number. 
fn pattern_count(&self) -> usize { match *self { Imp::NFA(ref nfa) => nfa.pattern_count(), Imp::DFA(ref dfa) => dfa.pattern_count(), } } #[inline(always)] fn overlapping_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, match_index: &mut usize, ) -> Option { match *self { Imp::NFA(ref nfa) => nfa.overlapping_find_at( prestate, haystack, at, state_id, match_index, ), Imp::DFA(ref dfa) => dfa.overlapping_find_at( prestate, haystack, at, state_id, match_index, ), } } #[inline(always)] fn earliest_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, ) -> Option { match *self { Imp::NFA(ref nfa) => { nfa.earliest_find_at(prestate, haystack, at, state_id) } Imp::DFA(ref dfa) => { dfa.earliest_find_at(prestate, haystack, at, state_id) } } } #[inline(always)] fn find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option { match *self { Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at), Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at), } } } /// An iterator of non-overlapping matches in a particular haystack. /// /// This iterator yields matches according to the /// [`MatchKind`](enum.MatchKind.html) /// used by this automaton. /// /// This iterator is constructed via the /// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter) /// method. /// /// The type variable `S` refers to the representation used for state /// identifiers. (By default, this is `usize`.) /// /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. /// /// The lifetime `'b` refers to the lifetime of the haystack being searched. #[derive(Debug)] pub struct FindIter<'a, 'b, S: 'a + StateID> { fsm: &'a Imp, prestate: PrefilterState, haystack: &'b [u8], pos: usize, } impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> { fn new(ac: &'a AhoCorasick, haystack: &'b [u8]) -> FindIter<'a, 'b, S> { let prestate = PrefilterState::new(ac.max_pattern_len()); FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 } } } impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> { type Item = Match; fn next(&mut self) -> Option { if self.pos > self.haystack.len() { return None; } let result = self.fsm.find_at_no_state( &mut self.prestate, self.haystack, self.pos, ); let mat = match result { None => return None, Some(mat) => mat, }; if mat.end() == self.pos { // If the automaton can match the empty string and if we found an // empty match, then we need to forcefully move the position. self.pos += 1; } else { self.pos = mat.end(); } Some(mat) } } /// An iterator of overlapping matches in a particular haystack. /// /// This iterator will report all possible matches in a particular haystack, /// even when the matches overlap. /// /// This iterator is constructed via the /// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter) /// method. /// /// The type variable `S` refers to the representation used for state /// identifiers. (By default, this is `usize`.) /// /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. /// /// The lifetime `'b` refers to the lifetime of the haystack being searched. 
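///
/// A short sketch of how this iterator is typically obtained and consumed
/// (via the method linked above):
///
/// ```
/// use aho_corasick::AhoCorasick;
///
/// let ac = AhoCorasick::new(&["append", "appendage", "app"]);
/// let matches: Vec<usize> = ac
///     .find_overlapping_iter("append the app to the appendage")
///     .map(|mat| mat.pattern())
///     .collect();
/// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);
/// ```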
#[derive(Debug)] pub struct FindOverlappingIter<'a, 'b, S: 'a + StateID> { fsm: &'a Imp, prestate: PrefilterState, haystack: &'b [u8], pos: usize, last_match_end: usize, state_id: S, match_index: usize, } impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> { fn new( ac: &'a AhoCorasick, haystack: &'b [u8], ) -> FindOverlappingIter<'a, 'b, S> { assert!( ac.supports_overlapping(), "automaton does not support overlapping searches" ); let prestate = PrefilterState::new(ac.max_pattern_len()); FindOverlappingIter { fsm: &ac.imp, prestate, haystack, pos: 0, last_match_end: 0, state_id: ac.imp.start_state(), match_index: 0, } } } impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> { type Item = Match; fn next(&mut self) -> Option { let result = self.fsm.overlapping_find_at( &mut self.prestate, self.haystack, self.pos, &mut self.state_id, &mut self.match_index, ); match result { None => return None, Some(m) => { self.pos = m.end(); Some(m) } } } } /// An iterator that reports Aho-Corasick matches in a stream. /// /// This iterator yields elements of type `io::Result`, where an error /// is reported if there was a problem reading from the underlying stream. /// The iterator terminates only when the underlying stream reaches `EOF`. /// /// This iterator is constructed via the /// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter) /// method. /// /// The type variable `R` refers to the `io::Read` stream that is being read /// from. /// /// The type variable `S` refers to the representation used for state /// identifiers. (By default, this is `usize`.) /// /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton. #[derive(Debug)] pub struct StreamFindIter<'a, R, S: 'a + StateID> { it: StreamChunkIter<'a, R, S>, } impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> { fn new(ac: &'a AhoCorasick, rdr: R) -> StreamFindIter<'a, R, S> { StreamFindIter { it: StreamChunkIter::new(ac, rdr) } } } impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> { type Item = io::Result; fn next(&mut self) -> Option> { loop { match self.it.next() { None => return None, Some(Err(err)) => return Some(Err(err)), Some(Ok(StreamChunk::NonMatch { .. })) => {} Some(Ok(StreamChunk::Match { mat, .. })) => { return Some(Ok(mat)); } } } } } /// An iterator over chunks in an underlying reader. Each chunk either /// corresponds to non-matching bytes or matching bytes, but all bytes from /// the underlying reader are reported in sequence. There may be an arbitrary /// number of non-matching chunks before seeing a matching chunk. /// /// N.B. This does not actually implement Iterator because we need to borrow /// from the underlying reader. But conceptually, it's still an iterator. #[derive(Debug)] struct StreamChunkIter<'a, R, S: 'a + StateID> { /// The AC automaton. fsm: &'a Imp, /// State associated with this automaton's prefilter. It is a heuristic /// for stopping the prefilter if it's deemed ineffective. prestate: PrefilterState, /// The source of bytes we read from. rdr: R, /// A fixed size buffer. This is what we actually search. There are some /// invariants around the buffer's size, namely, it must be big enough to /// contain the longest possible match. buf: Buffer, /// The ID of the FSM state we're currently in. state_id: S, /// The current position at which to start the next search in `buf`. search_pos: usize, /// The absolute position of `search_pos`, where `0` corresponds to the /// position of the first byte read from `rdr`. 
absolute_pos: usize, /// The ending position of the last StreamChunk that was returned to the /// caller. This position is used to determine whether we need to emit /// non-matching bytes before emitting a match. report_pos: usize, /// A match that should be reported on the next call. pending_match: Option, /// Enabled only when the automaton can match the empty string. When /// enabled, we need to execute one final search after consuming the /// reader to find the trailing empty match. has_empty_match_at_end: bool, } /// A single chunk yielded by the stream chunk iterator. /// /// The `'r` lifetime refers to the lifetime of the stream chunk iterator. #[derive(Debug)] enum StreamChunk<'r> { /// A chunk that does not contain any matches. NonMatch { bytes: &'r [u8], start: usize }, /// A chunk that precisely contains a match. Match { bytes: &'r [u8], mat: Match }, } impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> { fn new(ac: &'a AhoCorasick, rdr: R) -> StreamChunkIter<'a, R, S> { assert!( ac.supports_stream(), "stream searching is only supported for Standard match semantics" ); let prestate = PrefilterState::new(ac.max_pattern_len()); let buf = Buffer::new(ac.imp.max_pattern_len()); let state_id = ac.imp.start_state(); StreamChunkIter { fsm: &ac.imp, prestate, rdr, buf, state_id, absolute_pos: 0, report_pos: 0, search_pos: 0, pending_match: None, has_empty_match_at_end: ac.is_match(""), } } fn next<'r>(&'r mut self) -> Option>> { loop { if let Some(mut mat) = self.pending_match.take() { let bytes = &self.buf.buffer()[mat.start()..mat.end()]; self.report_pos = mat.end(); mat = mat.increment(self.absolute_pos); return Some(Ok(StreamChunk::Match { bytes, mat })); } if self.search_pos >= self.buf.len() { if let Some(end) = self.unreported() { let bytes = &self.buf.buffer()[self.report_pos..end]; let start = self.absolute_pos + self.report_pos; self.report_pos = end; return Some(Ok(StreamChunk::NonMatch { bytes, start })); } if self.buf.len() >= self.buf.min_buffer_len() { // This is the point at which we roll our buffer, which we // only do if our buffer has at least the minimum amount of // bytes in it. Before rolling, we update our various // positions to be consistent with the buffer after it has // been rolled. self.report_pos -= self.buf.len() - self.buf.min_buffer_len(); self.absolute_pos += self.search_pos - self.buf.min_buffer_len(); self.search_pos = self.buf.min_buffer_len(); self.buf.roll(); } match self.buf.fill(&mut self.rdr) { Err(err) => return Some(Err(err)), Ok(false) => { // We've hit EOF, but if there are still some // unreported bytes remaining, return them now. if self.report_pos < self.buf.len() { let bytes = &self.buf.buffer()[self.report_pos..]; let start = self.absolute_pos + self.report_pos; self.report_pos = self.buf.len(); let chunk = StreamChunk::NonMatch { bytes, start }; return Some(Ok(chunk)); } else { // We've reported everything, but there might still // be a match at the very last position. 
if !self.has_empty_match_at_end { return None; } // fallthrough for another search to get trailing // empty matches self.has_empty_match_at_end = false; } } Ok(true) => {} } } let result = self.fsm.earliest_find_at( &mut self.prestate, self.buf.buffer(), self.search_pos, &mut self.state_id, ); match result { None => { self.search_pos = self.buf.len(); } Some(mat) => { self.state_id = self.fsm.start_state(); if mat.end() == self.search_pos { // If the automaton can match the empty string and if // we found an empty match, then we need to forcefully // move the position. self.search_pos += 1; } else { self.search_pos = mat.end(); } self.pending_match = Some(mat.clone()); if self.report_pos < mat.start() { let bytes = &self.buf.buffer()[self.report_pos..mat.start()]; let start = self.absolute_pos + self.report_pos; self.report_pos = mat.start(); let chunk = StreamChunk::NonMatch { bytes, start }; return Some(Ok(chunk)); } } } } } fn unreported(&self) -> Option { let end = self.search_pos.saturating_sub(self.buf.min_buffer_len()); if self.report_pos < end { Some(end) } else { None } } } /// A builder for configuring an Aho-Corasick automaton. #[derive(Clone, Debug)] pub struct AhoCorasickBuilder { nfa_builder: nfa::Builder, dfa_builder: dfa::Builder, dfa: bool, } impl Default for AhoCorasickBuilder { fn default() -> AhoCorasickBuilder { AhoCorasickBuilder::new() } } impl AhoCorasickBuilder { /// Create a new builder for configuring an Aho-Corasick automaton. /// /// If you don't need fine grained configuration or aren't sure which knobs /// to set, try using /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured) /// instead. pub fn new() -> AhoCorasickBuilder { AhoCorasickBuilder { nfa_builder: nfa::Builder::new(), dfa_builder: dfa::Builder::new(), dfa: false, } } /// Build an Aho-Corasick automaton using the configuration set on this /// builder. /// /// A builder may be reused to create more automatons. /// /// This method will use the default for representing internal state /// identifiers, which is `usize`. This guarantees that building the /// automaton will succeed and is generally a good default, but can make /// the size of the automaton 2-8 times bigger than it needs to be, /// depending on your target platform. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasickBuilder; /// /// let patterns = &["foo", "bar", "baz"]; /// let ac = AhoCorasickBuilder::new() /// .build(patterns); /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern())); /// ``` pub fn build(&self, patterns: I) -> AhoCorasick where I: IntoIterator, P: AsRef<[u8]>, { // The builder only returns an error if the chosen state ID // representation is too small to fit all of the given patterns. In // this case, since we fix the representation to usize, it will always // work because it's impossible to overflow usize since the underlying // storage would OOM long before that happens. self.build_with_size::(patterns) .expect("usize state ID type should always work") } /// Build an Aho-Corasick automaton using the configuration set on this /// builder with a specific state identifier representation. This only has /// an effect when the `dfa` option is enabled. /// /// Generally, the choices for a state identifier representation are /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default. /// The advantage of choosing a smaller state identifier representation /// is that the automaton produced will be smaller. 
    /// This might be beneficial for just generally using less space, or might
    /// even allow it to fit more of the automaton in your CPU's cache, leading
    /// to overall better search performance.
    ///
    /// Unlike the standard `build` method, this can report an error if the
    /// state identifier representation cannot support the size of the
    /// automaton.
    ///
    /// Note that the state identifier representation is determined by the
    /// `S` type variable. This requires a type hint of some sort, either
    /// by specifying the return type or using the turbofish, e.g.,
    /// `build_with_size::<u16, _, _>(...)`.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
    ///
    /// # fn example() -> Result<(), ::aho_corasick::Error> {
    /// let patterns = &["foo", "bar", "baz"];
    /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new()
    ///     .build_with_size(patterns)?;
    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
    /// # Ok(()) }; example().unwrap()
    /// ```
    ///
    /// Or alternatively, with turbofish:
    ///
    /// ```
    /// use aho_corasick::AhoCorasickBuilder;
    ///
    /// # fn example() -> Result<(), ::aho_corasick::Error> {
    /// let patterns = &["foo", "bar", "baz"];
    /// let ac = AhoCorasickBuilder::new()
    ///     .build_with_size::<u8, _, _>(patterns)?;
    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
    /// # Ok(()) }; example().unwrap()
    /// ```
    pub fn build_with_size<S, I, P>(
        &self,
        patterns: I,
    ) -> Result<AhoCorasick<S>, Error>
    where
        S: StateID,
        I: IntoIterator<Item = P>,
        P: AsRef<[u8]>,
    {
        let nfa = self.nfa_builder.build(patterns)?;
        let match_kind = nfa.match_kind().clone();
        let imp = if self.dfa {
            let dfa = self.dfa_builder.build(&nfa)?;
            Imp::DFA(dfa)
        } else {
            Imp::NFA(nfa)
        };
        Ok(AhoCorasick { imp, match_kind })
    }

    /// Automatically configure the settings on this builder according to the
    /// patterns that will be used to construct the automaton.
    ///
    /// The idea here is to balance space and time automatically. That is, when
    /// searching a small number of patterns, this will attempt to use the
    /// fastest possible configuration since the total space required will be
    /// small anyway. As the number of patterns grows, this will fall back to
    /// slower configurations that use less space.
    ///
    /// This is guaranteed to never set `match_kind`, but any other option may
    /// be overridden.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::AhoCorasickBuilder;
    ///
    /// let patterns = &["foo", "bar", "baz"];
    /// let ac = AhoCorasickBuilder::new()
    ///     .auto_configure(patterns)
    ///     .build(patterns);
    /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
    /// ```
    pub fn auto_configure<B: AsRef<[u8]>>(
        &mut self,
        patterns: &[B],
    ) -> &mut AhoCorasickBuilder {
        // N.B. Currently we only use the length of `patterns` to make a
        // decision here, and could therefore ask for an `ExactSizeIterator`
        // instead. But it's conceivable that we might adapt this to look at
        // the total number of bytes, which would require a second pass.
        //
        // The logic here is fairly rudimentary at the moment, but probably
        // OK. The idea here is to use the fastest thing possible for a small
        // number of patterns. That is, a DFA with no byte classes, since byte
        // classes require an extra indirection for every byte searched. With a
        // moderate number of patterns, we still want a DFA, but save on both
        // space and compilation time by enabling byte classes. Finally, fall
        // back to the slower but smaller NFA.
        if patterns.len() <= 100 {
            // N.B.
Using byte classes can actually be faster by improving // locality, but this only really applies for multi-megabyte // automata (i.e., automata that don't fit in your CPU's cache). self.dfa(true).byte_classes(false); } else if patterns.len() <= 5000 { self.dfa(true); } self } /// Set the desired match semantics. /// /// The default is `MatchKind::Standard`, which corresponds to the match /// semantics supported by the standard textbook description of the /// Aho-Corasick algorithm. Namely, matches are reported as soon as they /// are found. Moreover, this is the only way to get overlapping matches /// or do stream searching. /// /// The other kinds of match semantics that are supported are /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former /// corresponds to the match you would get if you were to try to match /// each pattern at each position in the haystack in the same order that /// you give to the automaton. That is, it returns the leftmost match /// corresponding the earliest pattern given to the automaton. The latter /// corresponds to finding the longest possible match among all leftmost /// matches. /// /// For more details on match semantics, see the /// [documentation for `MatchKind`](enum.MatchKind.html). /// /// # Examples /// /// In these examples, we demonstrate the differences between match /// semantics for a particular set of patterns in a specific order: /// `b`, `abc`, `abcd`. /// /// Standard semantics: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::Standard) // default, not necessary /// .build(patterns); /// let mat = ac.find(haystack).expect("should have a match"); /// assert_eq!("b", &haystack[mat.start()..mat.end()]); /// ``` /// /// Leftmost-first semantics: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostFirst) /// .build(patterns); /// let mat = ac.find(haystack).expect("should have a match"); /// assert_eq!("abc", &haystack[mat.start()..mat.end()]); /// ``` /// /// Leftmost-longest semantics: /// /// ``` /// use aho_corasick::{AhoCorasickBuilder, MatchKind}; /// /// let patterns = &["b", "abc", "abcd"]; /// let haystack = "abcd"; /// /// let ac = AhoCorasickBuilder::new() /// .match_kind(MatchKind::LeftmostLongest) /// .build(patterns); /// let mat = ac.find(haystack).expect("should have a match"); /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]); /// ``` pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder { self.nfa_builder.match_kind(kind); self } /// Enable anchored mode, which requires all matches to start at the /// first position in a haystack. /// /// This option is disabled by default. 
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::AhoCorasickBuilder;
    ///
    /// let patterns = &["foo", "bar"];
    /// let haystack = "foobar";
    ///
    /// let ac = AhoCorasickBuilder::new()
    ///     .anchored(true)
    ///     .build(patterns);
    /// assert_eq!(1, ac.find_iter(haystack).count());
    /// ```
    ///
    /// When searching for overlapping matches, all matches that start at
    /// the beginning of a haystack will be reported:
    ///
    /// ```
    /// use aho_corasick::AhoCorasickBuilder;
    ///
    /// let patterns = &["foo", "foofoo"];
    /// let haystack = "foofoo";
    ///
    /// let ac = AhoCorasickBuilder::new()
    ///     .anchored(true)
    ///     .build(patterns);
    /// assert_eq!(2, ac.find_overlapping_iter(haystack).count());
    /// // A non-anchored search would return 3 matches.
    /// ```
    pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
        self.nfa_builder.anchored(yes);
        self
    }

    /// Enable ASCII-aware case insensitive matching.
    ///
    /// When this option is enabled, searching will be performed without
    /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
    ///
    /// Enabling this option does not change the search algorithm, but it may
    /// increase the size of the automaton.
    ///
    /// **NOTE:** In the future, support for full Unicode case insensitivity
    /// may be added, but ASCII case insensitivity is comparatively much
    /// simpler to add.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// use aho_corasick::AhoCorasickBuilder;
    ///
    /// let patterns = &["FOO", "bAr", "BaZ"];
    /// let haystack = "foo bar baz";
    ///
    /// let ac = AhoCorasickBuilder::new()
    ///     .ascii_case_insensitive(true)
    ///     .build(patterns);
    /// assert_eq!(3, ac.find_iter(haystack).count());
    /// ```
    pub fn ascii_case_insensitive(
        &mut self,
        yes: bool,
    ) -> &mut AhoCorasickBuilder {
        self.nfa_builder.ascii_case_insensitive(yes);
        self
    }

    /// Set the limit on how many NFA states use a dense representation for
    /// their transitions.
    ///
    /// A dense representation uses more space, but supports faster access to
    /// transitions at search time. Thus, this setting permits the control of a
    /// space vs time trade off when using the NFA variant of Aho-Corasick.
    ///
    /// This limit is expressed in terms of the depth of a state, i.e., the
    /// number of transitions from the starting state of the NFA. The idea is
    /// that most of the time searching will be spent near the starting state
    /// of the automaton, so states near the start state should use a dense
    /// representation. States further away from the start state would then use
    /// a sparse representation, which uses less space but is slower to access
    /// transitions at search time.
    ///
    /// By default, this is set to a low but non-zero number.
    ///
    /// This setting has no effect if the `dfa` option is enabled.
    pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
        self.nfa_builder.dense_depth(depth);
        self
    }

    /// Compile the standard Aho-Corasick automaton into a deterministic finite
    /// automaton (DFA).
    ///
    /// When this is disabled (which is the default), then a non-deterministic
    /// finite automaton (NFA) is used instead.
    ///
    /// The main benefit to a DFA is that it can execute searches more quickly
    /// than an NFA (perhaps 2-4 times as fast). The main drawback is that the
    /// DFA uses more space and can take much longer to build.
    ///
    /// Enabling this option does not change the time complexity for
    /// constructing the Aho-Corasick automaton (which is `O(p)` where
    /// `p` is the total number of patterns being compiled). Enabling this
    /// option does however reduce the time complexity of non-overlapping
    /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the
    /// haystack.
    ///
    /// In general, it's a good idea to enable this if you're searching a
    /// small number of fairly short patterns (~1000), or if you want the
    /// fastest possible search without regard to compilation time or space
    /// usage.
    pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
        self.dfa = yes;
        self
    }

    /// Enable heuristic prefilter optimizations.
    ///
    /// When enabled, searching will attempt to quickly skip to match
    /// candidates using specialized literal search routines. A prefilter
    /// cannot always be used, and is generally treated as a heuristic. It
    /// can be useful to disable this if the prefilter is observed to be
    /// sub-optimal for a particular workload.
    ///
    /// This is enabled by default.
    pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
        self.nfa_builder.prefilter(yes);
        self
    }

    /// Shrink the size of the transition alphabet by mapping bytes to their
    /// equivalence classes. This only has an effect when the `dfa` option is
    /// enabled.
    ///
    /// When enabled, a DFA will use a map from all possible bytes
    /// to their corresponding equivalence class. Each equivalence class
    /// represents a set of bytes that does not discriminate between a match
    /// and a non-match in the DFA. For example, the patterns `bar` and `baz`
    /// have at least five equivalence classes: singleton sets of `b`, `a`, `r`
    /// and `z`, and a final set that contains every other byte.
    ///
    /// The advantage of this map is that the size of the transition table can
    /// be reduced drastically from `#states * 256 * sizeof(id)` to
    /// `#states * k * sizeof(id)` where `k` is the number of equivalence
    /// classes. As a result, total space usage can decrease substantially.
    /// Moreover, since a smaller alphabet is used, compilation becomes faster
    /// as well.
    ///
    /// The disadvantage of this map is that every byte searched must be
    /// passed through this map before it can be used to determine the next
    /// transition. This has a small match time performance cost. However, if
    /// the DFA is otherwise very large without byte classes, then using byte
    /// classes can greatly improve memory locality and thus lead to better
    /// overall performance.
    ///
    /// This option is enabled by default.
    pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
        self.dfa_builder.byte_classes(yes);
        self
    }

    /// Premultiply state identifiers in the transition table. This only has
    /// an effect when the `dfa` option is enabled.
    ///
    /// When enabled, state identifiers are premultiplied to point to their
    /// corresponding row in the transition table. That is, given the `i`th
    /// state, its corresponding premultiplied identifier is `i * k` where `k`
    /// is the alphabet size of the automaton. (The alphabet size is at most
    /// 256, but is in practice smaller if byte classes is enabled.)
    ///
    /// When state identifiers are not premultiplied, then the identifier of
    /// the `i`th state is `i`.
    ///
    /// The advantage of premultiplying state identifiers is that it saves a
    /// multiplication instruction per byte when searching with a DFA. This has
    /// been observed to lead to a 20% performance benefit in micro-benchmarks.
    ///
    /// The primary disadvantage of premultiplying state identifiers is
    /// that they require a larger integer size to represent.
For example, /// if the DFA has 200 states, then its premultiplied form requires 16 /// bits to represent every possible state identifier, where as its /// non-premultiplied form only requires 8 bits. /// /// This option is enabled by default. pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder { self.dfa_builder.premultiply(yes); self } } /// A knob for controlling the match semantics of an Aho-Corasick automaton. /// /// There are two generally different ways that Aho-Corasick automatons can /// report matches. The first way is the "standard" approach that results from /// implementing most textbook explanations of Aho-Corasick. The second way is /// to report only the leftmost non-overlapping matches. The leftmost approach /// is in turn split into two different ways of resolving ambiguous matches: /// leftmost-first and leftmost-longest. /// /// The `Standard` match kind is the default and is the only one that supports /// overlapping matches and stream searching. (Trying to find overlapping /// or streaming matches using leftmost match semantics will result in a /// panic.) The `Standard` match kind will report matches as they are seen. /// When searching for overlapping matches, then all possible matches are /// reported. When searching for non-overlapping matches, the first match seen /// is reported. For example, for non-overlapping matches, given the patterns /// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is /// reported since it is detected first. The `abcd` match is never reported /// since it overlaps with the `b` match. /// /// In contrast, the leftmost match kind always prefers the leftmost match /// among all possible matches. Given the same example as above with `abcd` and /// `b` as patterns and `abcdef` as the subject string, the leftmost match is /// `abcd` since it begins before the `b` match, even though the `b` match is /// detected before the `abcd` match. In this case, the `b` match is not /// reported at all since it overlaps with the `abcd` match. /// /// The difference between leftmost-first and leftmost-longest is in how they /// resolve ambiguous matches when there are multiple leftmost matches to /// choose from. Leftmost-first always chooses the pattern that was provided /// earliest, where as leftmost-longest always chooses the longest matching /// pattern. For example, given the patterns `a` and `ab` and the subject /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match /// is `ab`. Conversely, if the patterns were given in reverse order, i.e., /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches /// would be `ab`. Stated differently, the leftmost-first match depends on the /// order in which the patterns were given to the Aho-Corasick automaton. /// Because of that, when leftmost-first matching is used, if a pattern `A` /// that appears before a pattern `B` is a prefix of `B`, then it is impossible /// to ever observe a match of `B`. /// /// If you're not sure which match kind to pick, then stick with the standard /// kind, which is the default. In particular, if you need overlapping or /// streaming matches, then you _must_ use the standard kind. The leftmost /// kinds are useful in specific circumstances. For example, leftmost-first can /// be very useful as a way to implement match priority based on the order of /// patterns given and leftmost-longest can be useful for dictionary searching /// such that only the longest matching words are reported. 
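///
/// As a small illustration of the `abcd`/`b` scenario described above (a
/// sketch using the builder's `match_kind` knob):
///
/// ```
/// use aho_corasick::{AhoCorasickBuilder, MatchKind};
///
/// let patterns = &["abcd", "b"];
/// let haystack = "abcdef";
///
/// // Standard semantics report the first match detected, which is `b`.
/// let ac = AhoCorasickBuilder::new()
///     .match_kind(MatchKind::Standard)
///     .build(patterns);
/// let mat = ac.find(haystack).expect("should have a match");
/// assert_eq!("b", &haystack[mat.start()..mat.end()]);
///
/// // Leftmost semantics prefer the match that starts earliest, `abcd`.
/// let ac = AhoCorasickBuilder::new()
///     .match_kind(MatchKind::LeftmostFirst)
///     .build(patterns);
/// let mat = ac.find(haystack).expect("should have a match");
/// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
/// ```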
/// /// # Relationship with regular expression alternations /// /// Understanding match semantics can be a little tricky, and one easy way /// to conceptualize non-overlapping matches from an Aho-Corasick automaton /// is to think about them as a simple alternation of literals in a regular /// expression. For example, let's say we wanted to match the strings /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It /// turns out that regular expression engines have two different ways of /// matching this alternation. The first way, leftmost-longest, is commonly /// found in POSIX compatible implementations of regular expressions (such as /// `grep`). The second way, leftmost-first, is commonly found in backtracking /// implementations such as Perl. (Some regex engines, such as RE2 and Rust's /// regex engine do not use backtracking, but still implement leftmost-first /// semantics in an effort to match the behavior of dominant backtracking /// regex engines such as those found in Perl, Ruby, Python, Javascript and /// PHP.) /// /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex /// will match `Samwise` because it is the longest possible match, but a /// Perl-like regex will match `Sam` since it appears earlier in the /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine /// will never match `Samwise` since `Sam` will always have higher priority. /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is /// still longest match, but it also appears earlier than `Sam`. /// /// The "standard" match semantics of Aho-Corasick generally don't correspond /// to the match semantics of any large group of regex implementations, so /// there's no direct analogy that can be made here. Standard match semantics /// are generally useful for overlapping matches, or if you just want to see /// matches as they are detected. /// /// The main conclusion to draw from this section is that the match semantics /// can be tweaked to precisely match either Perl-like regex alternations or /// POSIX regex alternations. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum MatchKind { /// Use standard match semantics, which support overlapping matches. When /// used with non-overlapping matches, matches are reported as they are /// seen. Standard, /// Use leftmost-first match semantics, which reports leftmost matches. /// When there are multiple possible leftmost matches, the match /// corresponding to the pattern that appeared earlier when constructing /// the automaton is reported. /// /// This does **not** support overlapping matches or stream searching. If /// this match kind is used, attempting to find overlapping matches or /// stream matches will panic. LeftmostFirst, /// Use leftmost-longest match semantics, which reports leftmost matches. /// When there are multiple possible leftmost matches, the longest match /// is chosen. /// /// This does **not** support overlapping matches or stream searching. If /// this match kind is used, attempting to find overlapping matches or /// stream matches will panic. LeftmostLongest, /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients /// don't count on exhaustive matching. (Otherwise, adding a new variant /// could break existing code.) #[doc(hidden)] __Nonexhaustive, } /// The default match kind is `MatchKind::Standard`. 
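///
/// A tiny check of that default (relying only on the `Default` impl below):
///
/// ```
/// use aho_corasick::MatchKind;
///
/// assert_eq!(MatchKind::Standard, MatchKind::default());
/// ```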
impl Default for MatchKind { fn default() -> MatchKind { MatchKind::Standard } } impl MatchKind { fn supports_overlapping(&self) -> bool { self.is_standard() } fn supports_stream(&self) -> bool { // TODO: It may be possible to support this. It's hard. // // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838 self.is_standard() } pub(crate) fn is_standard(&self) -> bool { *self == MatchKind::Standard } pub(crate) fn is_leftmost(&self) -> bool { *self == MatchKind::LeftmostFirst || *self == MatchKind::LeftmostLongest } pub(crate) fn is_leftmost_first(&self) -> bool { *self == MatchKind::LeftmostFirst } /// Convert this match kind into a packed match kind. If this match kind /// corresponds to standard semantics, then this returns None, since /// packed searching does not support standard semantics. pub(crate) fn as_packed(&self) -> Option { match *self { MatchKind::Standard => None, MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst), MatchKind::LeftmostLongest => { Some(packed::MatchKind::LeftmostLongest) } MatchKind::__Nonexhaustive => unreachable!(), } } } #[cfg(test)] mod tests { use super::*; #[test] fn oibits() { use std::panic::{RefUnwindSafe, UnwindSafe}; fn assert_send() {} fn assert_sync() {} fn assert_unwind_safe() {} assert_send::(); assert_sync::(); assert_unwind_safe::(); assert_send::(); assert_sync::(); assert_unwind_safe::(); } } aho-corasick-0.7.8/src/automaton.rs010066400017500001731000000574161361627453200155050ustar0000000000000000use ahocorasick::MatchKind; use prefilter::{self, Candidate, Prefilter, PrefilterState}; use state_id::{dead_id, fail_id, StateID}; use Match; // NOTE: This trait essentially started as a copy of the same trait from from // regex-automata, with some wording changed since we use this trait for // NFAs in addition to DFAs in this crate. Additionally, we do not export // this trait. It's only used internally to reduce code duplication. The // regex-automata crate needs to expose it because its Regex type is generic // over implementations of this trait. In this crate, we encapsulate everything // behind the AhoCorasick type. // // This trait is a bit of a mess, but it's not quite clear how to fix it. // Basically, there are several competing concerns: // // * We need performance, so everything effectively needs to get monomorphized. // * There are several variations on searching Aho-Corasick automatons: // overlapping, standard and leftmost. Overlapping and standard are somewhat // combined together below, but there is no real way to combine standard with // leftmost. Namely, leftmost requires continuing a search even after a match // is found, in order to correctly disambiguate a match. // * On top of that, *sometimes* callers want to know which state the automaton // is in after searching. This is principally useful for overlapping and // stream searches. However, when callers don't care about this, we really // do not want to be forced to compute it, since it sometimes requires extra // work. Thus, there are effectively two copies of leftmost searching: one // for tracking the state ID and one that doesn't. We should ideally do the // same for standard searching, but my sanity stopped me. // SAFETY RATIONALE: Previously, the code below went to some length to remove // all bounds checks. This generally produced tighter assembly and lead to // 20-50% improvements in micro-benchmarks on corpora made up of random // characters. 
This somewhat makes sense, since the branch predictor is going // to be at its worse on random text. // // However, using the aho-corasick-debug tool and manually benchmarking // different inputs, the code *with* bounds checks actually wound up being // slightly faster: // // $ cat input // Sherlock Holmes // John Watson // Professor Moriarty // Irene Adler // Mary Watson // // $ aho-corasick-debug-safe \ // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa // pattern read time: 32.824µs // automaton build time: 444.687µs // automaton heap usage: 72392 bytes // match count: 639 // count time: 1.809961702s // // $ aho-corasick-debug-master \ // input OpenSubtitles2018.raw.sample.en --kind leftmost-first --dfa // pattern read time: 31.425µs // automaton build time: 317.434µs // automaton heap usage: 72392 bytes // match count: 639 // count time: 2.059157705s // // I was able to reproduce this result on two different machines (an i5 and // an i7). Therefore, we go the route of safe code for now. /// A trait describing the interface of an Aho-Corasick finite state machine. /// /// Every automaton has exactly one fail state, one dead state and exactly one /// start state. Generally, these correspond to the first, second and third /// states, respectively. The failure state is always treated as a sentinel. /// That is, no correct Aho-Corasick automaton will ever transition into the /// fail state. The dead state, however, can be transitioned into, but only /// when leftmost-first or leftmost-longest match semantics are enabled and /// only when at least one match has been observed. /// /// Every automaton also has one or more match states, such that /// `Automaton::is_match_state(id)` returns `true` if and only if `id` /// corresponds to a match state. pub trait Automaton { /// The representation used for state identifiers in this automaton. /// /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. type ID: StateID; /// The type of matching that should be done. fn match_kind(&self) -> &MatchKind; /// Returns true if and only if this automaton uses anchored searches. fn anchored(&self) -> bool; /// An optional prefilter for quickly skipping to the next candidate match. /// A prefilter must report at least every match, although it may report /// positions that do not correspond to a match. That is, it must not allow /// false negatives, but can allow false positives. /// /// Currently, a prefilter only runs when the automaton is in the start /// state. That is, the position reported by a prefilter should always /// correspond to the start of a potential match. fn prefilter(&self) -> Option<&dyn Prefilter>; /// Return the identifier of this automaton's start state. fn start_state(&self) -> Self::ID; /// Returns true if and only if the given state identifier refers to a /// valid state. fn is_valid(&self, id: Self::ID) -> bool; /// Returns true if and only if the given identifier corresponds to a match /// state. /// /// The state ID given must be valid, or else implementors may panic. fn is_match_state(&self, id: Self::ID) -> bool; /// Returns true if and only if the given identifier corresponds to a state /// that is either the dead state or a match state. /// /// Depending on the implementation of the automaton, this routine can /// be used to save a branch in the core matching loop. Nevertheless, /// `is_match_state(id) || id == dead_id()` is always a valid /// implementation. Indeed, this is the default implementation. 
/// /// The state ID given must be valid, or else implementors may panic. fn is_match_or_dead_state(&self, id: Self::ID) -> bool { id == dead_id() || self.is_match_state(id) } /// If the given state is a match state, return the match corresponding /// to the given match index. `end` must be the ending position of the /// detected match. If no match exists or if `match_index` exceeds the /// number of matches in this state, then `None` is returned. /// /// The state ID given must be valid, or else implementors may panic. /// /// If the given state ID is correct and if the `match_index` is less than /// the number of matches for that state, then this is guaranteed to return /// a match. fn get_match( &self, id: Self::ID, match_index: usize, end: usize, ) -> Option; /// Returns the number of matches for the given state. If the given state /// is not a match state, then this returns 0. /// /// The state ID given must be valid, or else implementors must panic. fn match_count(&self, id: Self::ID) -> usize; /// Given the current state that this automaton is in and the next input /// byte, this method returns the identifier of the next state. The /// identifier returned must always be valid and may never correspond to /// the fail state. The returned identifier may, however, point to the /// dead state. /// /// This is not safe so that implementors may look up the next state /// without memory safety checks such as bounds checks. As such, callers /// must ensure that the given identifier corresponds to a valid automaton /// state. Implementors must, in turn, ensure that this routine is safe for /// all valid state identifiers and for all possible `u8` values. fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; /// Like next_state, but debug_asserts that the underlying /// implementation never returns a `fail_id()` for the next state. fn next_state_no_fail(&self, current: Self::ID, input: u8) -> Self::ID { let next = self.next_state(current, input); // We should never see a transition to the failure state. debug_assert!( next != fail_id(), "automaton should never return fail_id for next state" ); next } /// Execute a search using standard match semantics. /// /// This can be used even when the automaton was constructed with leftmost /// match semantics when you want to find the earliest possible match. This /// can also be used as part of an overlapping search implementation. /// /// N.B. This does not report a match if `state_id` is given as a matching /// state. As such, this should not be used directly. #[inline(always)] fn standard_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option { if let Some(pre) = self.prefilter() { self.standard_find_at_imp( prestate, Some(pre), haystack, at, state_id, ) } else { self.standard_find_at_imp(prestate, None, haystack, at, state_id) } } // It's important for this to always be inlined. Namely, its only caller // is standard_find_at, and the inlining should remove the case analysis // for prefilter scanning when there is no prefilter available. 
#[inline(always)] fn standard_find_at_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, state_id: &mut Self::ID, ) -> Option { while at < haystack.len() { if let Some(pre) = prefilter { if prestate.is_effective(at) && *state_id == self.start_state() { let c = prefilter::next(prestate, pre, haystack, at) .into_option(); match c { None => return None, Some(i) => { at = i; } } } } // CORRECTNESS: next_state is correct for all possible u8 values, // so the only thing we're concerned about is the validity of // `state_id`. `state_id` either comes from the caller (in which // case, we assume it is correct), or it comes from the return // value of next_state, which is guaranteed to be correct. *state_id = self.next_state_no_fail(*state_id, haystack[at]); at += 1; // This routine always quits immediately after seeing a // match, and since dead states can only come after seeing // a match, seeing a dead state here is impossible. (Unless // we have an anchored automaton, in which case, dead states // are used to stop a search.) debug_assert!( *state_id != dead_id() || self.anchored(), "standard find should never see a dead state" ); if self.is_match_or_dead_state(*state_id) { return if *state_id == dead_id() { None } else { self.get_match(*state_id, 0, at) }; } } None } /// Execute a search using leftmost (either first or longest) match /// semantics. /// /// The principle difference between searching with standard semantics and /// searching with leftmost semantics is that leftmost searching will /// continue searching even after a match has been found. Once a match /// is found, the search does not stop until either the haystack has been /// exhausted or a dead state is observed in the automaton. (Dead states /// only exist in automatons constructed with leftmost semantics.) That is, /// we rely on the construction of the automaton to tell us when to quit. #[inline(never)] fn leftmost_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option { if let Some(pre) = self.prefilter() { self.leftmost_find_at_imp( prestate, Some(pre), haystack, at, state_id, ) } else { self.leftmost_find_at_imp(prestate, None, haystack, at, state_id) } } // It's important for this to always be inlined. Namely, its only caller // is leftmost_find_at, and the inlining should remove the case analysis // for prefilter scanning when there is no prefilter available. #[inline(always)] fn leftmost_find_at_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, state_id: &mut Self::ID, ) -> Option { debug_assert!(self.match_kind().is_leftmost()); if self.anchored() && at > 0 && *state_id == self.start_state() { return None; } let mut last_match = self.get_match(*state_id, 0, at); while at < haystack.len() { if let Some(pre) = prefilter { if prestate.is_effective(at) && *state_id == self.start_state() { let c = prefilter::next(prestate, pre, haystack, at) .into_option(); match c { None => return None, Some(i) => { at = i; } } } } // CORRECTNESS: next_state is correct for all possible u8 values, // so the only thing we're concerned about is the validity of // `state_id`. `state_id` either comes from the caller (in which // case, we assume it is correct), or it comes from the return // value of next_state, which is guaranteed to be correct. 
*state_id = self.next_state_no_fail(*state_id, haystack[at]); at += 1; if self.is_match_or_dead_state(*state_id) { if *state_id == dead_id() { // The only way to enter into a dead state is if a match // has been found, so we assert as much. This is different // from normal automata, where you might enter a dead state // if you know a subsequent match will never be found // (regardless of whether a match has already been found). // For Aho-Corasick, it is built so that we can match at // any position, so the possibility of a match always // exists. // // (Unless we have an anchored automaton, in which case, // dead states are used to stop a search.) debug_assert!( last_match.is_some() || self.anchored(), "failure state should only be seen after match" ); return last_match; } last_match = self.get_match(*state_id, 0, at); } } last_match } /// This is like leftmost_find_at, but does not need to track a caller /// provided state id. In other words, the only output of this routine is a /// match, if one exists. /// /// It is regrettable that we need to effectively copy a chunk of /// implementation twice, but when we don't need to track the state ID, we /// can allow the prefilter to report matches immediately without having /// to re-confirm them with the automaton. The re-confirmation step is /// necessary in leftmost_find_at because tracing through the automaton is /// the only way to correctly set the state ID. (Perhaps an alternative /// would be to keep a map from pattern ID to matching state ID, but that /// complicates the code and still doesn't permit us to defer to the /// prefilter entirely when possible.) /// /// I did try a few things to avoid the code duplication here, but nothing /// optimized as well as this approach. (In microbenchmarks, there was /// about a 25% difference.) #[inline(never)] fn leftmost_find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option { if let Some(pre) = self.prefilter() { self.leftmost_find_at_no_state_imp( prestate, Some(pre), haystack, at, ) } else { self.leftmost_find_at_no_state_imp(prestate, None, haystack, at) } } // It's important for this to always be inlined. Namely, its only caller // is leftmost_find_at_no_state, and the inlining should remove the case // analysis for prefilter scanning when there is no prefilter available. #[inline(always)] fn leftmost_find_at_no_state_imp( &self, prestate: &mut PrefilterState, prefilter: Option<&dyn Prefilter>, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(self.match_kind().is_leftmost()); if self.anchored() && at > 0 { return None; } // If our prefilter handles confirmation of matches 100% of the // time, and since we don't need to track state IDs, we can avoid // Aho-Corasick completely. if let Some(pre) = prefilter { // We should never have a prefilter during an anchored search. debug_assert!(!self.anchored()); if !pre.reports_false_positives() { return match pre.next_candidate(prestate, haystack, at) { Candidate::None => None, Candidate::Match(m) => Some(m), Candidate::PossibleStartOfMatch(_) => unreachable!(), }; } } let mut state_id = self.start_state(); let mut last_match = self.get_match(state_id, 0, at); while at < haystack.len() { if let Some(pre) = prefilter { if prestate.is_effective(at) && state_id == self.start_state() { match prefilter::next(prestate, pre, haystack, at) { Candidate::None => return None, // Since we aren't tracking a state ID, we can // quit early once we know we have a match. 
Candidate::Match(m) => return Some(m), Candidate::PossibleStartOfMatch(i) => { at = i; } } } } // CORRECTNESS: next_state is correct for all possible u8 values, // so the only thing we're concerned about is the validity of // `state_id`. `state_id` either comes from the caller (in which // case, we assume it is correct), or it comes from the return // value of next_state, which is guaranteed to be correct. state_id = self.next_state_no_fail(state_id, haystack[at]); at += 1; if self.is_match_or_dead_state(state_id) { if state_id == dead_id() { // The only way to enter into a dead state is if a // match has been found, so we assert as much. This // is different from normal automata, where you might // enter a dead state if you know a subsequent match // will never be found (regardless of whether a match // has already been found). For Aho-Corasick, it is // built so that we can match at any position, so the // possibility of a match always exists. // // (Unless we have an anchored automaton, in which // case, dead states are used to stop a search.) debug_assert!( last_match.is_some() || self.anchored(), "failure state should only be seen after match" ); return last_match; } last_match = self.get_match(state_id, 0, at); } } last_match } /// Execute an overlapping search. /// /// When executing an overlapping match, the previous state ID in addition /// to the previous match index should be given. If there are more matches /// at the given state, then the match is reported and the given index is /// incremented. #[inline(always)] fn overlapping_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, match_index: &mut usize, ) -> Option { if self.anchored() && at > 0 && *state_id == self.start_state() { return None; } let match_count = self.match_count(*state_id); if *match_index < match_count { // This is guaranteed to return a match since // match_index < match_count. let result = self.get_match(*state_id, *match_index, at); debug_assert!(result.is_some(), "must be a match"); *match_index += 1; return result; } *match_index = 0; match self.standard_find_at(prestate, haystack, at, state_id) { None => None, Some(m) => { *match_index = 1; Some(m) } } } /// Return the earliest match found. This returns as soon as we know that /// we have a match. As such, this does not necessarily correspond to the /// leftmost starting match, but rather, the leftmost position at which a /// match ends. #[inline(always)] fn earliest_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option { if *state_id == self.start_state() { if self.anchored() && at > 0 { return None; } if let Some(m) = self.get_match(*state_id, 0, at) { return Some(m); } } self.standard_find_at(prestate, haystack, at, state_id) } /// A convenience function for finding the next match according to the /// match semantics of this automaton. For standard match semantics, this /// finds the earliest match. Otherwise, the leftmost match is found. #[inline(always)] fn find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut Self::ID, ) -> Option { match *self.match_kind() { MatchKind::Standard => { self.earliest_find_at(prestate, haystack, at, state_id) } MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { self.leftmost_find_at(prestate, haystack, at, state_id) } MatchKind::__Nonexhaustive => unreachable!(), } } /// Like find_at, but does not track state identifiers. 
This permits some /// optimizations when a prefilter that confirms its own matches is /// present. #[inline(always)] fn find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option { match *self.match_kind() { MatchKind::Standard => { let mut state = self.start_state(); self.earliest_find_at(prestate, haystack, at, &mut state) } MatchKind::LeftmostFirst | MatchKind::LeftmostLongest => { self.leftmost_find_at_no_state(prestate, haystack, at) } MatchKind::__Nonexhaustive => unreachable!(), } } } aho-corasick-0.7.8/src/buffer.rs010066400017500001731000000123461361627453200147400ustar0000000000000000use std::cmp; use std::io; use std::ptr; /// The default buffer capacity that we use for the stream buffer. const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1 << 10); // 8 KB /// A fairly simple roll buffer for supporting stream searches. /// /// This buffer acts as a temporary place to store a fixed amount of data when /// reading from a stream. Its central purpose is to allow "rolling" some /// suffix of the data to the beginning of the buffer before refilling it with /// more data from the stream. For example, let's say we are trying to match /// "foobar" on a stream. When we report the match, we'd like to not only /// report the correct offsets at which the match occurs, but also the matching /// bytes themselves. So let's say our stream is a file with the following /// contents: `test test foobar test test`. Now assume that we happen to read /// the aforementioned file in two chunks: `test test foo` and `bar test test`. /// Naively, it would not be possible to report a single contiguous `foobar` /// match, but this roll buffer allows us to do that. Namely, after the second /// read, the contents of the buffer should be `st foobar test test`, where the /// search should ultimately resume immediately after `foo`. (The prefix `st ` /// is included because the roll buffer saves N bytes at the end of the buffer, /// where N is the maximum possible length of a match.) /// /// A lot of the logic for dealing with this is unfortunately split out between /// this roll buffer and the `StreamChunkIter`. #[derive(Debug)] pub struct Buffer { /// The raw buffer contents. This has a fixed size and never increases. buf: Vec, /// The minimum size of the buffer, which is equivalent to the maximum /// possible length of a match. This corresponds to the amount that we /// roll min: usize, /// The end of the contents of this buffer. end: usize, } impl Buffer { /// Create a new buffer for stream searching. The minimum buffer length /// given should be the size of the maximum possible match length. pub fn new(min_buffer_len: usize) -> Buffer { let min = cmp::max(1, min_buffer_len); // The minimum buffer amount is also the amount that we roll our // buffer in order to support incremental searching. To this end, // our actual capacity needs to be at least 1 byte bigger than our // minimum amount, otherwise we won't have any overlap. In actuality, // we want our buffer to be a bit bigger than that for performance // reasons, so we set a lower bound of `8 * min`. // // TODO: It would be good to find a way to test the streaming // implementation with the minimal buffer size. let capacity = cmp::max(min * 8, DEFAULT_BUFFER_CAPACITY); Buffer { buf: vec![0; capacity], min, end: 0 } } /// Return the contents of this buffer. #[inline] pub fn buffer(&self) -> &[u8] { &self.buf[..self.end] } /// Return the minimum size of the buffer. 
The only way a buffer may be /// smaller than this is if the stream itself contains less than the /// minimum buffer amount. #[inline] pub fn min_buffer_len(&self) -> usize { self.min } /// Return the total length of the contents in the buffer. #[inline] pub fn len(&self) -> usize { self.end } /// Return all free capacity in this buffer. fn free_buffer(&mut self) -> &mut [u8] { &mut self.buf[self.end..] } /// Refill the contents of this buffer by reading as much as possible into /// this buffer's free capacity. If no more bytes could be read, then this /// returns false. Otherwise, this reads until it has filled the buffer /// past the minimum amount. pub fn fill(&mut self, mut rdr: R) -> io::Result { let mut readany = false; loop { let readlen = rdr.read(self.free_buffer())?; if readlen == 0 { return Ok(readany); } readany = true; self.end += readlen; if self.len() >= self.min { return Ok(true); } } } /// Roll the contents of the buffer so that the suffix of this buffer is /// moved to the front and all other contents are dropped. The size of the /// suffix corresponds precisely to the minimum buffer length. /// /// This should only be called when the entire contents of this buffer have /// been searched. pub fn roll(&mut self) { let roll_start = self .end .checked_sub(self.min) .expect("buffer capacity should be bigger than minimum amount"); let roll_len = self.min; assert!(roll_start + roll_len <= self.end); unsafe { // SAFETY: A buffer contains Copy data, so there's no problem // moving it around. Safety also depends on our indices being in // bounds, which they always should be, given the assert above. // // TODO: Switch to [T]::copy_within once our MSRV is high enough. ptr::copy( self.buf[roll_start..].as_ptr(), self.buf.as_mut_ptr(), roll_len, ); } self.end = roll_len; } } aho-corasick-0.7.8/src/byte_frequencies.rs010064400017500000144000000105171352131022200167720ustar0000000000000000pub const BYTE_FREQUENCIES: [u8; 256] = [ 55, // '\x00' 52, // '\x01' 51, // '\x02' 50, // '\x03' 49, // '\x04' 48, // '\x05' 47, // '\x06' 46, // '\x07' 45, // '\x08' 103, // '\t' 242, // '\n' 66, // '\x0b' 67, // '\x0c' 229, // '\r' 44, // '\x0e' 43, // '\x0f' 42, // '\x10' 41, // '\x11' 40, // '\x12' 39, // '\x13' 38, // '\x14' 37, // '\x15' 36, // '\x16' 35, // '\x17' 34, // '\x18' 33, // '\x19' 56, // '\x1a' 32, // '\x1b' 31, // '\x1c' 30, // '\x1d' 29, // '\x1e' 28, // '\x1f' 255, // ' ' 148, // '!' 164, // '"' 149, // '#' 136, // '$' 160, // '%' 155, // '&' 173, // "'" 221, // '(' 222, // ')' 134, // '*' 122, // '+' 232, // ',' 202, // '-' 215, // '.' 224, // '/' 208, // '0' 220, // '1' 204, // '2' 187, // '3' 183, // '4' 179, // '5' 177, // '6' 168, // '7' 178, // '8' 200, // '9' 226, // ':' 195, // ';' 154, // '<' 184, // '=' 174, // '>' 126, // '?' 
120, // '@' 191, // 'A' 157, // 'B' 194, // 'C' 170, // 'D' 189, // 'E' 162, // 'F' 161, // 'G' 150, // 'H' 193, // 'I' 142, // 'J' 137, // 'K' 171, // 'L' 176, // 'M' 185, // 'N' 167, // 'O' 186, // 'P' 112, // 'Q' 175, // 'R' 192, // 'S' 188, // 'T' 156, // 'U' 140, // 'V' 143, // 'W' 123, // 'X' 133, // 'Y' 128, // 'Z' 147, // '[' 138, // '\\' 146, // ']' 114, // '^' 223, // '_' 151, // '`' 249, // 'a' 216, // 'b' 238, // 'c' 236, // 'd' 253, // 'e' 227, // 'f' 218, // 'g' 230, // 'h' 247, // 'i' 135, // 'j' 180, // 'k' 241, // 'l' 233, // 'm' 246, // 'n' 244, // 'o' 231, // 'p' 139, // 'q' 245, // 'r' 243, // 's' 251, // 't' 235, // 'u' 201, // 'v' 196, // 'w' 240, // 'x' 214, // 'y' 152, // 'z' 182, // '{' 205, // '|' 181, // '}' 127, // '~' 27, // '\x7f' 212, // '\x80' 211, // '\x81' 210, // '\x82' 213, // '\x83' 228, // '\x84' 197, // '\x85' 169, // '\x86' 159, // '\x87' 131, // '\x88' 172, // '\x89' 105, // '\x8a' 80, // '\x8b' 98, // '\x8c' 96, // '\x8d' 97, // '\x8e' 81, // '\x8f' 207, // '\x90' 145, // '\x91' 116, // '\x92' 115, // '\x93' 144, // '\x94' 130, // '\x95' 153, // '\x96' 121, // '\x97' 107, // '\x98' 132, // '\x99' 109, // '\x9a' 110, // '\x9b' 124, // '\x9c' 111, // '\x9d' 82, // '\x9e' 108, // '\x9f' 118, // '\xa0' 141, // '¡' 113, // '¢' 129, // '£' 119, // '¤' 125, // '¥' 165, // '¦' 117, // '§' 92, // '¨' 106, // '©' 83, // 'ª' 72, // '«' 99, // '¬' 93, // '\xad' 65, // '®' 79, // '¯' 166, // '°' 237, // '±' 163, // '²' 199, // '³' 190, // '´' 225, // 'µ' 209, // '¶' 203, // '·' 198, // '¸' 217, // '¹' 219, // 'º' 206, // '»' 234, // '¼' 248, // '½' 158, // '¾' 239, // '¿' 255, // 'À' 255, // 'Á' 255, // 'Â' 255, // 'Ã' 255, // 'Ä' 255, // 'Å' 255, // 'Æ' 255, // 'Ç' 255, // 'È' 255, // 'É' 255, // 'Ê' 255, // 'Ë' 255, // 'Ì' 255, // 'Í' 255, // 'Î' 255, // 'Ï' 255, // 'Ð' 255, // 'Ñ' 255, // 'Ò' 255, // 'Ó' 255, // 'Ô' 255, // 'Õ' 255, // 'Ö' 255, // '×' 255, // 'Ø' 255, // 'Ù' 255, // 'Ú' 255, // 'Û' 255, // 'Ü' 255, // 'Ý' 255, // 'Þ' 255, // 'ß' 255, // 'à' 255, // 'á' 255, // 'â' 255, // 'ã' 255, // 'ä' 255, // 'å' 255, // 'æ' 255, // 'ç' 255, // 'è' 255, // 'é' 255, // 'ê' 255, // 'ë' 255, // 'ì' 255, // 'í' 255, // 'î' 255, // 'ï' 255, // 'ð' 255, // 'ñ' 255, // 'ò' 255, // 'ó' 255, // 'ô' 255, // 'õ' 255, // 'ö' 255, // '÷' 255, // 'ø' 255, // 'ù' 255, // 'ú' 255, // 'û' 255, // 'ü' 255, // 'ý' 255, // 'þ' 255, // 'ÿ' ]; aho-corasick-0.7.8/src/classes.rs010066400017500001731000000201251361627453200151160ustar0000000000000000use std::fmt; /// A representation of byte oriented equivalence classes. /// /// This is used in an FSM to reduce the size of the transition table. This can /// have a particularly large impact not only on the total size of an FSM, but /// also on compile times. #[derive(Clone, Copy)] pub struct ByteClasses([u8; 256]); impl ByteClasses { /// Creates a new set of equivalence classes where all bytes are mapped to /// the same class. pub fn empty() -> ByteClasses { ByteClasses([0; 256]) } /// Creates a new set of equivalence classes where each byte belongs to /// its own equivalence class. pub fn singletons() -> ByteClasses { let mut classes = ByteClasses::empty(); for i in 0..256 { classes.set(i as u8, i as u8); } classes } /// Set the equivalence class for the given byte. #[inline] pub fn set(&mut self, byte: u8, class: u8) { self.0[byte as usize] = class; } /// Get the equivalence class for the given byte. 
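///
/// Illustrative sketch only (this type is internal to the crate, so the
/// snippet below is not compiled as a public doctest):
///
/// ```ignore
/// // With singleton classes, every byte is its own equivalence class.
/// let classes = ByteClasses::singletons();
/// assert_eq!(classes.get(b'a'), b'a');
///
/// // With the empty set of classes, every byte maps to class 0.
/// let classes = ByteClasses::empty();
/// assert_eq!(classes.get(b'a'), 0);
/// ```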
#[inline] pub fn get(&self, byte: u8) -> u8 { // SAFETY: This is safe because all dense transitions have // exactly 256 elements, so all u8 values are valid indices. self.0[byte as usize] } /// Return the total number of elements in the alphabet represented by /// these equivalence classes. Equivalently, this returns the total number /// of equivalence classes. #[inline] pub fn alphabet_len(&self) -> usize { self.0[255] as usize + 1 } /// Returns true if and only if every byte in this class maps to its own /// equivalence class. Equivalently, there are 256 equivalence classes /// and each class contains exactly one byte. #[inline] pub fn is_singleton(&self) -> bool { self.alphabet_len() == 256 } /// Returns an iterator over a sequence of representative bytes from each /// equivalence class. Namely, this yields exactly N items, where N is /// equivalent to the number of equivalence classes. Each item is an /// arbitrary byte drawn from each equivalence class. /// /// This is useful when one is determinizing an NFA and the NFA's alphabet /// hasn't been converted to equivalence classes yet. Picking an arbitrary /// byte from each equivalence class then permits a full exploration of /// the NFA instead of using every possible byte value. pub fn representatives(&self) -> ByteClassRepresentatives { ByteClassRepresentatives { classes: self, byte: 0, last_class: None } } /// Returns all of the bytes in the given equivalence class. /// /// The second element in the tuple indicates the number of elements in /// the array. fn elements(&self, equiv: u8) -> ([u8; 256], usize) { let (mut array, mut len) = ([0; 256], 0); for b in 0..256 { if self.get(b as u8) == equiv { array[len] = b as u8; len += 1; } } (array, len) } } impl fmt::Debug for ByteClasses { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { if self.is_singleton() { write!(f, "ByteClasses({{singletons}})") } else { write!(f, "ByteClasses(")?; for equiv in 0..self.alphabet_len() { let (members, len) = self.elements(equiv as u8); write!(f, " {} => {:?}", equiv, &members[..len])?; } write!(f, ")") } } } /// An iterator over representative bytes from each equivalence class. #[derive(Debug)] pub struct ByteClassRepresentatives<'a> { classes: &'a ByteClasses, byte: usize, last_class: Option, } impl<'a> Iterator for ByteClassRepresentatives<'a> { type Item = u8; fn next(&mut self) -> Option { while self.byte < 256 { let byte = self.byte as u8; let class = self.classes.get(byte); self.byte += 1; if self.last_class != Some(class) { self.last_class = Some(class); return Some(byte); } } None } } /// A byte class builder keeps track of an *approximation* of equivalence /// classes of bytes during NFA construction. That is, every byte in an /// equivalence class cannot discriminate between a match and a non-match. /// /// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w] /// and [{-\xFF] never discriminate between a match and a non-match, precisely /// because they never occur in the literals anywhere. /// /// Note though that this does not necessarily compute the minimal set of /// equivalence classes. For example, in the literals above, the byte ranges /// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence /// classes even though they could be treated a single class. The reason for /// this is implementation complexity. In the future, we should endeavor to /// compute the minimal equivalence classes since they can have a rather large /// impact on the size of the DFA. 
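///
/// As a concrete sketch of the builder API below (the byte ranges are
/// chosen purely for illustration and the snippet is not compiled as a
/// doctest):
///
/// ```ignore
/// // Mark [a-c] and [x-z] as ranges that can discriminate a match.
/// let mut builder = ByteClassBuilder::new();
/// builder.set_range(b'a', b'c');
/// builder.set_range(b'x', b'z');
/// let classes = builder.build();
/// // This yields five classes: [\x00-`], [a-c], [d-w], [x-z], [{-\xFF].
/// assert_eq!(classes.alphabet_len(), 5);
/// assert_eq!(classes.get(b'\x00'), classes.get(b'`'));
/// assert_eq!(classes.get(b'a'), classes.get(b'c'));
/// assert_ne!(classes.get(b'c'), classes.get(b'd'));
/// assert_eq!(classes.get(b'{'), classes.get(b'\xFF'));
/// ```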
/// /// The representation here is 256 booleans, all initially set to false. Each /// boolean maps to its corresponding byte based on position. A `true` value /// indicates the end of an equivalence class, where its corresponding byte /// and all of the bytes corresponding to all previous contiguous `false` /// values are in the same equivalence class. /// /// This particular representation only permits contiguous ranges of bytes to /// be in the same equivalence class, which means that we can never discover /// the true minimal set of equivalence classes. #[derive(Debug)] pub struct ByteClassBuilder(Vec); impl ByteClassBuilder { /// Create a new builder of byte classes where all bytes are part of the /// same equivalence class. pub fn new() -> ByteClassBuilder { ByteClassBuilder(vec![false; 256]) } /// Indicate the the range of byte given (inclusive) can discriminate a /// match between it and all other bytes outside of the range. pub fn set_range(&mut self, start: u8, end: u8) { debug_assert!(start <= end); if start > 0 { self.0[start as usize - 1] = true; } self.0[end as usize] = true; } /// Build byte classes that map all byte values to their corresponding /// equivalence class. The last mapping indicates the largest equivalence /// class identifier (which is never bigger than 255). pub fn build(&self) -> ByteClasses { let mut classes = ByteClasses::empty(); let mut class = 0u8; let mut i = 0; loop { classes.set(i as u8, class as u8); if i >= 255 { break; } if self.0[i] { class = class.checked_add(1).unwrap(); } i += 1; } classes } } #[cfg(test)] mod tests { use super::*; #[test] fn byte_classes() { let mut set = ByteClassBuilder::new(); set.set_range(b'a', b'z'); let classes = set.build(); assert_eq!(classes.get(0), 0); assert_eq!(classes.get(1), 0); assert_eq!(classes.get(2), 0); assert_eq!(classes.get(b'a' - 1), 0); assert_eq!(classes.get(b'a'), 1); assert_eq!(classes.get(b'm'), 1); assert_eq!(classes.get(b'z'), 1); assert_eq!(classes.get(b'z' + 1), 2); assert_eq!(classes.get(254), 2); assert_eq!(classes.get(255), 2); let mut set = ByteClassBuilder::new(); set.set_range(0, 2); set.set_range(4, 6); let classes = set.build(); assert_eq!(classes.get(0), 0); assert_eq!(classes.get(1), 0); assert_eq!(classes.get(2), 0); assert_eq!(classes.get(3), 1); assert_eq!(classes.get(4), 2); assert_eq!(classes.get(5), 2); assert_eq!(classes.get(6), 2); assert_eq!(classes.get(7), 3); assert_eq!(classes.get(255), 3); } #[test] fn full_byte_classes() { let mut set = ByteClassBuilder::new(); for i in 0..256u16 { set.set_range(i as u8, i as u8); } assert_eq!(set.build().alphabet_len(), 256); } } aho-corasick-0.7.8/src/dfa.rs010066400017500001731000000505061361627453200142210ustar0000000000000000use std::mem::size_of; use ahocorasick::MatchKind; use automaton::Automaton; use classes::ByteClasses; use error::Result; use nfa::{PatternID, PatternLength, NFA}; use prefilter::{Prefilter, PrefilterObj, PrefilterState}; use state_id::{dead_id, fail_id, premultiply_overflow_error, StateID}; use Match; #[derive(Clone, Debug)] pub enum DFA { Standard(Standard), ByteClass(ByteClass), Premultiplied(Premultiplied), PremultipliedByteClass(PremultipliedByteClass), } impl DFA { fn repr(&self) -> &Repr { match *self { DFA::Standard(ref dfa) => dfa.repr(), DFA::ByteClass(ref dfa) => dfa.repr(), DFA::Premultiplied(ref dfa) => dfa.repr(), DFA::PremultipliedByteClass(ref dfa) => dfa.repr(), } } pub fn match_kind(&self) -> &MatchKind { &self.repr().match_kind } pub fn heap_bytes(&self) -> usize { self.repr().heap_bytes } pub 
fn max_pattern_len(&self) -> usize { self.repr().max_pattern_len } pub fn pattern_count(&self) -> usize { self.repr().pattern_count } pub fn start_state(&self) -> S { self.repr().start_id } #[inline(always)] pub fn overlapping_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, match_index: &mut usize, ) -> Option { match *self { DFA::Standard(ref dfa) => dfa.overlapping_find_at( prestate, haystack, at, state_id, match_index, ), DFA::ByteClass(ref dfa) => dfa.overlapping_find_at( prestate, haystack, at, state_id, match_index, ), DFA::Premultiplied(ref dfa) => dfa.overlapping_find_at( prestate, haystack, at, state_id, match_index, ), DFA::PremultipliedByteClass(ref dfa) => dfa.overlapping_find_at( prestate, haystack, at, state_id, match_index, ), } } #[inline(always)] pub fn earliest_find_at( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, state_id: &mut S, ) -> Option { match *self { DFA::Standard(ref dfa) => { dfa.earliest_find_at(prestate, haystack, at, state_id) } DFA::ByteClass(ref dfa) => { dfa.earliest_find_at(prestate, haystack, at, state_id) } DFA::Premultiplied(ref dfa) => { dfa.earliest_find_at(prestate, haystack, at, state_id) } DFA::PremultipliedByteClass(ref dfa) => { dfa.earliest_find_at(prestate, haystack, at, state_id) } } } #[inline(always)] pub fn find_at_no_state( &self, prestate: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Option { match *self { DFA::Standard(ref dfa) => { dfa.find_at_no_state(prestate, haystack, at) } DFA::ByteClass(ref dfa) => { dfa.find_at_no_state(prestate, haystack, at) } DFA::Premultiplied(ref dfa) => { dfa.find_at_no_state(prestate, haystack, at) } DFA::PremultipliedByteClass(ref dfa) => { dfa.find_at_no_state(prestate, haystack, at) } } } } #[derive(Clone, Debug)] pub struct Standard(Repr); impl Standard { fn repr(&self) -> &Repr { &self.0 } } impl Automaton for Standard { type ID = S; fn match_kind(&self) -> &MatchKind { &self.repr().match_kind } fn anchored(&self) -> bool { self.repr().anchored } fn prefilter(&self) -> Option<&dyn Prefilter> { self.repr().prefilter.as_ref().map(|p| p.as_ref()) } fn start_state(&self) -> S { self.repr().start_id } fn is_valid(&self, id: S) -> bool { id.to_usize() < self.repr().state_count } fn is_match_state(&self, id: S) -> bool { self.repr().is_match_state(id) } fn is_match_or_dead_state(&self, id: S) -> bool { self.repr().is_match_or_dead_state(id) } fn get_match( &self, id: S, match_index: usize, end: usize, ) -> Option { self.repr().get_match(id, match_index, end) } fn match_count(&self, id: S) -> usize { self.repr().match_count(id) } fn next_state(&self, current: S, input: u8) -> S { let o = current.to_usize() * 256 + input as usize; self.repr().trans[o] } } #[derive(Clone, Debug)] pub struct ByteClass(Repr); impl ByteClass { fn repr(&self) -> &Repr { &self.0 } } impl Automaton for ByteClass { type ID = S; fn match_kind(&self) -> &MatchKind { &self.repr().match_kind } fn anchored(&self) -> bool { self.repr().anchored } fn prefilter(&self) -> Option<&dyn Prefilter> { self.repr().prefilter.as_ref().map(|p| p.as_ref()) } fn start_state(&self) -> S { self.repr().start_id } fn is_valid(&self, id: S) -> bool { id.to_usize() < self.repr().state_count } fn is_match_state(&self, id: S) -> bool { self.repr().is_match_state(id) } fn is_match_or_dead_state(&self, id: S) -> bool { self.repr().is_match_or_dead_state(id) } fn get_match( &self, id: S, match_index: usize, end: usize, ) -> Option { self.repr().get_match(id, match_index, end) } fn 
match_count(&self, id: S) -> usize { self.repr().match_count(id) } fn next_state(&self, current: S, input: u8) -> S { let alphabet_len = self.repr().byte_classes.alphabet_len(); let input = self.repr().byte_classes.get(input); let o = current.to_usize() * alphabet_len + input as usize; self.repr().trans[o] } } #[derive(Clone, Debug)] pub struct Premultiplied(Repr); impl Premultiplied { fn repr(&self) -> &Repr { &self.0 } } impl Automaton for Premultiplied { type ID = S; fn match_kind(&self) -> &MatchKind { &self.repr().match_kind } fn anchored(&self) -> bool { self.repr().anchored } fn prefilter(&self) -> Option<&dyn Prefilter> { self.repr().prefilter.as_ref().map(|p| p.as_ref()) } fn start_state(&self) -> S { self.repr().start_id } fn is_valid(&self, id: S) -> bool { (id.to_usize() / 256) < self.repr().state_count } fn is_match_state(&self, id: S) -> bool { self.repr().is_match_state(id) } fn is_match_or_dead_state(&self, id: S) -> bool { self.repr().is_match_or_dead_state(id) } fn get_match( &self, id: S, match_index: usize, end: usize, ) -> Option { if id > self.repr().max_match { return None; } self.repr() .matches .get(id.to_usize() / 256) .and_then(|m| m.get(match_index)) .map(|&(id, len)| Match { pattern: id, len, end }) } fn match_count(&self, id: S) -> usize { let o = id.to_usize() / 256; self.repr().matches[o].len() } fn next_state(&self, current: S, input: u8) -> S { let o = current.to_usize() + input as usize; self.repr().trans[o] } } #[derive(Clone, Debug)] pub struct PremultipliedByteClass(Repr); impl PremultipliedByteClass { fn repr(&self) -> &Repr { &self.0 } } impl Automaton for PremultipliedByteClass { type ID = S; fn match_kind(&self) -> &MatchKind { &self.repr().match_kind } fn anchored(&self) -> bool { self.repr().anchored } fn prefilter(&self) -> Option<&dyn Prefilter> { self.repr().prefilter.as_ref().map(|p| p.as_ref()) } fn start_state(&self) -> S { self.repr().start_id } fn is_valid(&self, id: S) -> bool { (id.to_usize() / self.repr().alphabet_len()) < self.repr().state_count } fn is_match_state(&self, id: S) -> bool { self.repr().is_match_state(id) } fn is_match_or_dead_state(&self, id: S) -> bool { self.repr().is_match_or_dead_state(id) } fn get_match( &self, id: S, match_index: usize, end: usize, ) -> Option { if id > self.repr().max_match { return None; } self.repr() .matches .get(id.to_usize() / self.repr().alphabet_len()) .and_then(|m| m.get(match_index)) .map(|&(id, len)| Match { pattern: id, len, end }) } fn match_count(&self, id: S) -> usize { let o = id.to_usize() / self.repr().alphabet_len(); self.repr().matches[o].len() } fn next_state(&self, current: S, input: u8) -> S { let input = self.repr().byte_classes.get(input); let o = current.to_usize() + input as usize; self.repr().trans[o] } } #[derive(Clone, Debug)] pub struct Repr { match_kind: MatchKind, anchored: bool, premultiplied: bool, start_id: S, /// The length, in bytes, of the longest pattern in this automaton. This /// information is useful for keeping correct buffer sizes when searching /// on streams. max_pattern_len: usize, /// The total number of patterns added to this automaton. This includes /// patterns that may never match. pattern_count: usize, state_count: usize, max_match: S, /// The number of bytes of heap used by this NFA's transition table. heap_bytes: usize, /// A prefilter for quickly detecting candidate matchs, if pertinent. prefilter: Option, byte_classes: ByteClasses, trans: Vec, matches: Vec>, } impl Repr { /// Returns the total alphabet size for this DFA. 
/// /// If byte classes are enabled, then this corresponds to the number of /// equivalence classes. If they are disabled, then this is always 256. fn alphabet_len(&self) -> usize { self.byte_classes.alphabet_len() } /// Returns true only if the given state is a match state. fn is_match_state(&self, id: S) -> bool { id <= self.max_match && id > dead_id() } /// Returns true only if the given state is either a dead state or a match /// state. fn is_match_or_dead_state(&self, id: S) -> bool { id <= self.max_match } /// Get the ith match for the given state, where the end position of a /// match was found at `end`. /// /// # Panics /// /// The caller must ensure that the given state identifier is valid, /// otherwise this may panic. The `match_index` need not be valid. That is, /// if the given state has no matches then this returns `None`. fn get_match( &self, id: S, match_index: usize, end: usize, ) -> Option { if id > self.max_match { return None; } self.matches .get(id.to_usize()) .and_then(|m| m.get(match_index)) .map(|&(id, len)| Match { pattern: id, len, end }) } /// Return the total number of matches for the given state. /// /// # Panics /// /// The caller must ensure that the given identifier is valid, or else /// this panics. fn match_count(&self, id: S) -> usize { self.matches[id.to_usize()].len() } /// Get the next state given `from` as the current state and `byte` as the /// current input byte. fn next_state(&self, from: S, byte: u8) -> S { let alphabet_len = self.alphabet_len(); let byte = self.byte_classes.get(byte); self.trans[from.to_usize() * alphabet_len + byte as usize] } /// Set the `byte` transition for the `from` state to point to `to`. fn set_next_state(&mut self, from: S, byte: u8, to: S) { let alphabet_len = self.alphabet_len(); let byte = self.byte_classes.get(byte); self.trans[from.to_usize() * alphabet_len + byte as usize] = to; } /// Swap the given states in place. fn swap_states(&mut self, id1: S, id2: S) { assert!(!self.premultiplied, "can't swap states in premultiplied DFA"); let o1 = id1.to_usize() * self.alphabet_len(); let o2 = id2.to_usize() * self.alphabet_len(); for b in 0..self.alphabet_len() { self.trans.swap(o1 + b, o2 + b); } self.matches.swap(id1.to_usize(), id2.to_usize()); } /// This routine shuffles all match states in this DFA to the beginning /// of the DFA such that every non-match state appears after every match /// state. (With one exception: the special fail and dead states remain as /// the first two states.) /// /// The purpose of doing this shuffling is to avoid an extra conditional /// in the search loop, and in particular, detecting whether a state is a /// match or not does not need to access any memory. /// /// This updates `self.max_match` to point to the last matching state as /// well as `self.start` if the starting state was moved. 
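///
/// As an illustrative sketch (the state numbering below is hypothetical
/// and not taken from any real automaton), consider a DFA whose states
/// are laid out as follows, where `*` marks a match state:
///
/// ```text
/// index:  0     1     2      3  4   5  6
/// state:  fail  dead  start  A  B*  C  D*
/// ```
///
/// After shuffling, the match states occupy the slots immediately
/// following the dead state, all transitions are rewritten to the new
/// indices, and `start_id` is updated since the start state was
/// displaced:
///
/// ```text
/// index:  0     1     2   3   4  5  6        max_match = 3
/// state:  fail  dead  D*  B*  A  C  start
/// ```
///
/// With this layout, deciding whether a state is a match state reduces
/// to the comparison `dead_id() < id && id <= max_match`.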
fn shuffle_match_states(&mut self) { assert!( !self.premultiplied, "cannot shuffle match states of premultiplied DFA" ); if self.state_count <= 1 { return; } let mut first_non_match = self.start_id.to_usize(); while first_non_match < self.state_count && self.matches[first_non_match].len() > 0 { first_non_match += 1; } let mut swaps: Vec = vec![fail_id(); self.state_count]; let mut cur = self.state_count - 1; while cur > first_non_match { if self.matches[cur].len() > 0 { self.swap_states( S::from_usize(cur), S::from_usize(first_non_match), ); swaps[cur] = S::from_usize(first_non_match); swaps[first_non_match] = S::from_usize(cur); first_non_match += 1; while first_non_match < cur && self.matches[first_non_match].len() > 0 { first_non_match += 1; } } cur -= 1; } for id in (0..self.state_count).map(S::from_usize) { let alphabet_len = self.alphabet_len(); let offset = id.to_usize() * alphabet_len; for next in &mut self.trans[offset..offset + alphabet_len] { if swaps[next.to_usize()] != fail_id() { *next = swaps[next.to_usize()]; } } } if swaps[self.start_id.to_usize()] != fail_id() { self.start_id = swaps[self.start_id.to_usize()]; } self.max_match = S::from_usize(first_non_match - 1); } fn premultiply(&mut self) -> Result<()> { if self.premultiplied || self.state_count <= 1 { return Ok(()); } let alpha_len = self.alphabet_len(); premultiply_overflow_error( S::from_usize(self.state_count - 1), alpha_len, )?; for id in (2..self.state_count).map(S::from_usize) { let offset = id.to_usize() * alpha_len; for next in &mut self.trans[offset..offset + alpha_len] { if *next == dead_id() { continue; } *next = S::from_usize(next.to_usize() * alpha_len); } } self.premultiplied = true; self.start_id = S::from_usize(self.start_id.to_usize() * alpha_len); self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len); Ok(()) } /// Computes the total amount of heap used by this NFA in bytes. fn calculate_size(&mut self) { let mut size = (self.trans.len() * size_of::()) + (self.matches.len() * size_of::>()); for state_matches in &self.matches { size += state_matches.len() * size_of::<(PatternID, PatternLength)>(); } size += self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes()); self.heap_bytes = size; } } /// A builder for configuring the determinization of an NFA into a DFA. #[derive(Clone, Debug)] pub struct Builder { premultiply: bool, byte_classes: bool, } impl Builder { /// Create a new builder for a DFA. pub fn new() -> Builder { Builder { premultiply: true, byte_classes: true } } /// Build a DFA from the given NFA. /// /// This returns an error if the state identifiers exceed their /// representation size. This can only happen when state ids are /// premultiplied (which is enabled by default). 
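///
/// The sketch below is purely illustrative (the `nfa` value is assumed
/// to come from NFA construction elsewhere in this crate, and the
/// snippet is not compiled as a doctest). It shows how the two knobs on
/// this builder map onto the four `DFA` variants:
///
/// ```ignore
/// let dfa = Builder::new()
///     .byte_classes(true)
///     .premultiply(true)
///     .build(&nfa)?;
/// // Assuming the NFA produced non-trivial byte classes:
/// //   byte_classes=false, premultiply=false  => DFA::Standard
/// //   byte_classes=true,  premultiply=false  => DFA::ByteClass
/// //   byte_classes=false, premultiply=true   => DFA::Premultiplied
/// //   byte_classes=true,  premultiply=true   => DFA::PremultipliedByteClass
/// ```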
pub fn build(&self, nfa: &NFA) -> Result> { let byte_classes = if self.byte_classes { nfa.byte_classes().clone() } else { ByteClasses::singletons() }; let alphabet_len = byte_classes.alphabet_len(); let trans = vec![fail_id(); alphabet_len * nfa.state_len()]; let matches = vec![vec![]; nfa.state_len()]; let mut repr = Repr { match_kind: nfa.match_kind().clone(), anchored: nfa.anchored(), premultiplied: false, start_id: nfa.start_state(), max_pattern_len: nfa.max_pattern_len(), pattern_count: nfa.pattern_count(), state_count: nfa.state_len(), max_match: fail_id(), heap_bytes: 0, prefilter: nfa.prefilter_obj().map(|p| p.clone()), byte_classes: byte_classes.clone(), trans: trans, matches: matches, }; for id in (0..nfa.state_len()).map(S::from_usize) { repr.matches[id.to_usize()].extend_from_slice(nfa.matches(id)); let fail = nfa.failure_transition(id); nfa.iter_all_transitions(&byte_classes, id, |b, mut next| { if next == fail_id() { next = nfa_next_state_memoized(nfa, &repr, id, fail, b); } repr.set_next_state(id, b, next); }); } repr.shuffle_match_states(); repr.calculate_size(); if self.premultiply { repr.premultiply()?; if byte_classes.is_singleton() { Ok(DFA::Premultiplied(Premultiplied(repr))) } else { Ok(DFA::PremultipliedByteClass(PremultipliedByteClass(repr))) } } else { if byte_classes.is_singleton() { Ok(DFA::Standard(Standard(repr))) } else { Ok(DFA::ByteClass(ByteClass(repr))) } } } /// Whether to use byte classes or in the DFA. pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { self.byte_classes = yes; self } /// Whether to premultiply state identifier in the DFA. pub fn premultiply(&mut self, yes: bool) -> &mut Builder { self.premultiply = yes; self } } /// This returns the next NFA transition (including resolving failure /// transitions), except once it sees a state id less than the id of the DFA /// state that is currently being populated, then we no longer need to follow /// failure transitions and can instead query the pre-computed state id from /// the DFA itself. /// /// In general, this should only be called when a failure transition is seen. fn nfa_next_state_memoized( nfa: &NFA, dfa: &Repr, populating: S, mut current: S, input: u8, ) -> S { loop { if current < populating { return dfa.next_state(current, input); } let next = nfa.next_state(current, input); if next != fail_id() { return next; } current = nfa.failure_transition(current); } } aho-corasick-0.7.8/src/error.rs010064400017500000144000000064411352131022200145700ustar0000000000000000use std::error; use std::fmt; use std::result; pub type Result = result::Result; /// An error that occurred during the construction of an Aho-Corasick /// automaton. #[derive(Clone, Debug)] pub struct Error { kind: ErrorKind, } /// The kind of error that occurred. #[derive(Clone, Debug)] pub enum ErrorKind { /// An error that occurs when constructing an automaton would require the /// use of a state ID that overflows the chosen state ID representation. /// For example, if one is using `u8` for state IDs and builds a DFA with /// 257 states, then the last state's ID will be `256` which cannot be /// represented with `u8`. StateIDOverflow { /// The maximum possible state ID. max: usize, }, /// An error that occurs when premultiplication of state IDs is requested /// when constructing an Aho-Corasick DFA, but doing so would overflow the /// chosen state ID representation. /// /// When `max == requested_max`, then the state ID would overflow `usize`. PremultiplyOverflow { /// The maximum possible state id. 
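///
/// As an illustrative calculation (the numbers are arbitrary): with
/// `u16` state IDs this maximum is `65535`, so premultiplying state
/// `300` in an automaton with a 256-byte alphabet would need to
/// represent `300 * 256 = 76800`, which overflows even though `300`
/// itself fits.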
max: usize, /// The maximum ID required by premultiplication. requested_max: usize, }, } impl Error { /// Return the kind of this error. pub fn kind(&self) -> &ErrorKind { &self.kind } pub(crate) fn state_id_overflow(max: usize) -> Error { Error { kind: ErrorKind::StateIDOverflow { max } } } pub(crate) fn premultiply_overflow( max: usize, requested_max: usize, ) -> Error { Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } } } } impl error::Error for Error { fn description(&self) -> &str { match self.kind { ErrorKind::StateIDOverflow { .. } => { "state id representation too small" } ErrorKind::PremultiplyOverflow { .. } => { "state id representation too small for premultiplication" } } } } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.kind { ErrorKind::StateIDOverflow { max } => write!( f, "building the automaton failed because it required \ building more states that can be identified, where the \ maximum ID for the chosen representation is {}", max, ), ErrorKind::PremultiplyOverflow { max, requested_max } => { if max == requested_max { write!( f, "premultiplication of states requires the ability to \ represent a state ID greater than what can fit on \ this platform's usize, which is {}", ::std::usize::MAX, ) } else { write!( f, "premultiplication of states requires the ability to \ represent at least a state ID of {}, but the chosen \ representation only permits a maximum state ID of {}", requested_max, max, ) } } } } } aho-corasick-0.7.8/src/lib.rs010064400017500000144000000227771352131022200142170ustar0000000000000000/*! A library for finding occurrences of many patterns at once. This library provides multiple pattern search principally through an implementation of the [Aho-Corasick algorithm](https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm), which builds a fast finite state machine for executing searches in linear time. Additionally, this library provides a number of configuration options for building the automaton that permit controlling the space versus time trade off. Other features include simple ASCII case insensitive matching, finding overlapping matches, replacements, searching streams and even searching and replacing text in streams. Finally, unlike all other (known) Aho-Corasick implementations, this one supports enabling [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst) or [leftmost-longest](enum.MatchKind.html#variant.LeftmostFirst) match semantics, using a (seemingly) novel alternative construction algorithm. For more details on what match semantics means, see the [`MatchKind`](enum.MatchKind.html) type. # Overview This section gives a brief overview of the primary types in this crate: * [`AhoCorasick`](struct.AhoCorasick.html) is the primary type and represents an Aho-Corasick automaton. This is the type you use to execute searches. * [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html) can be used to build an Aho-Corasick automaton, and supports configuring a number of options. * [`Match`](struct.Match.html) represents a single match reported by an Aho-Corasick automaton. Each match has two pieces of information: the pattern that matched and the start and end byte offsets corresponding to the position in the haystack at which it matched. Additionally, the [`packed`](packed/index.html) sub-module contains a lower level API for using fast vectorized routines for finding a small number of patterns in a haystack. 
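As a quick end-to-end illustration of the pieces above, the following
program runs an overlapping search, which reports every pattern occurrence
even when occurrences overlap. (This uses the `find_overlapping_iter`
method backing the `FindOverlappingIter` type re-exported by this crate.)

```
use aho_corasick::AhoCorasick;

let patterns = &["append", "appendage", "app"];
let haystack = "append the app to the appendage";

let ac = AhoCorasick::new(patterns);
let mut matches = vec![];
for mat in ac.find_overlapping_iter(haystack) {
    matches.push((mat.pattern(), mat.start(), mat.end()));
}
assert_eq!(matches, vec![
    (2, 0, 3),
    (0, 0, 6),
    (2, 11, 14),
    (2, 22, 25),
    (0, 22, 28),
    (1, 22, 31),
]);
```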
# Example: basic searching This example shows how to search for occurrences of multiple patterns simultaneously. Each match includes the pattern that matched along with the byte offsets of the match. ``` use aho_corasick::AhoCorasick; let patterns = &["apple", "maple", "Snapple"]; let haystack = "Nobody likes maple in their apple flavored Snapple."; let ac = AhoCorasick::new(patterns); let mut matches = vec![]; for mat in ac.find_iter(haystack) { matches.push((mat.pattern(), mat.start(), mat.end())); } assert_eq!(matches, vec![ (1, 13, 18), (0, 28, 33), (2, 43, 50), ]); ``` # Example: case insensitivity This is like the previous example, but matches `Snapple` case insensitively using `AhoCorasickBuilder`: ``` use aho_corasick::AhoCorasickBuilder; let patterns = &["apple", "maple", "snapple"]; let haystack = "Nobody likes maple in their apple flavored Snapple."; let ac = AhoCorasickBuilder::new() .ascii_case_insensitive(true) .build(patterns); let mut matches = vec![]; for mat in ac.find_iter(haystack) { matches.push((mat.pattern(), mat.start(), mat.end())); } assert_eq!(matches, vec![ (1, 13, 18), (0, 28, 33), (2, 43, 50), ]); ``` # Example: replacing matches in a stream This example shows how to execute a search and replace on a stream without loading the entire stream into memory first. ``` use aho_corasick::AhoCorasick; # fn example() -> Result<(), ::std::io::Error> { let patterns = &["fox", "brown", "quick"]; let replace_with = &["sloth", "grey", "slow"]; // In a real example, these might be `std::fs::File`s instead. All you need to // do is supply a pair of `std::io::Read` and `std::io::Write` implementations. let rdr = "The quick brown fox."; let mut wtr = vec![]; let ac = AhoCorasick::new(patterns); ac.stream_replace_all(rdr.as_bytes(), &mut wtr, replace_with)?; assert_eq!(b"The slow grey sloth.".to_vec(), wtr); # Ok(()) }; example().unwrap() ``` # Example: finding the leftmost first match In the textbook description of Aho-Corasick, its formulation is typically structured such that it reports all possible matches, even when they overlap with another. In many cases, overlapping matches may not be desired, such as the case of finding all successive non-overlapping matches like you might with a standard regular expression. Unfortunately the "obvious" way to modify the Aho-Corasick algorithm to do this doesn't always work in the expected way, since it will report matches as soon as they are seen. For example, consider matching the regex `Samwise|Sam` against the text `Samwise`. Most regex engines (that are Perl-like, or non-POSIX) will report `Samwise` as a match, but the standard Aho-Corasick algorithm modified for reporting non-overlapping matches will report `Sam`. A novel contribution of this library is the ability to change the match semantics of Aho-Corasick (without additional search time overhead) such that `Samwise` is reported instead. 
For example, here's the standard approach: ``` use aho_corasick::AhoCorasick; let patterns = &["Samwise", "Sam"]; let haystack = "Samwise"; let ac = AhoCorasick::new(patterns); let mat = ac.find(haystack).expect("should have a match"); assert_eq!("Sam", &haystack[mat.start()..mat.end()]); ``` And now here's the leftmost-first version, which matches how a Perl-like regex will work: ``` use aho_corasick::{AhoCorasickBuilder, MatchKind}; let patterns = &["Samwise", "Sam"]; let haystack = "Samwise"; let ac = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostFirst) .build(patterns); let mat = ac.find(haystack).expect("should have a match"); assert_eq!("Samwise", &haystack[mat.start()..mat.end()]); ``` In addition to leftmost-first semantics, this library also supports leftmost-longest semantics, which match the POSIX behavior of a regular expression alternation. See [`MatchKind`](enum.MatchKind.html) for more details. # Prefilters While an Aho-Corasick automaton can perform admirably when compared to more naive solutions, it is generally slower than more specialized algorithms that are accelerated using vector instructions such as SIMD. For that reason, this library will internally use a "prefilter" to attempt to accelerate searches when possible. Currently, this library has fairly limited implementation that only applies when there are 3 or fewer unique starting bytes among all patterns in an automaton. While a prefilter is generally good to have on by default since it works well in the common case, it can lead to less predictable or even sub-optimal performance in some cases. For that reason, prefilters can be disabled via [`AhoCorasickBuilder::prefilter`](struct.AhoCorasickBuilder.html#method.prefilter). */ #![deny(missing_docs)] // We can never be truly no_std, but we could be alloc-only some day, so // require the std feature for now. #[cfg(not(feature = "std"))] compile_error!("`std` feature is currently required to build this crate"); extern crate memchr; #[cfg(test)] #[macro_use] extern crate doc_comment; #[cfg(test)] doctest!("../README.md"); pub use ahocorasick::{ AhoCorasick, AhoCorasickBuilder, FindIter, FindOverlappingIter, MatchKind, StreamFindIter, }; pub use error::{Error, ErrorKind}; pub use state_id::StateID; mod ahocorasick; mod automaton; mod buffer; mod byte_frequencies; mod classes; mod dfa; mod error; mod nfa; pub mod packed; mod prefilter; mod state_id; #[cfg(test)] mod tests; /// A representation of a match reported by an Aho-Corasick automaton. /// /// A match has two essential pieces of information: the identifier of the /// pattern that matched, along with the start and end offsets of the match /// in the haystack. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::AhoCorasick; /// /// let ac = AhoCorasick::new(&[ /// "foo", "bar", "baz", /// ]); /// let mat = ac.find("xxx bar xxx").expect("should have a match"); /// assert_eq!(1, mat.pattern()); /// assert_eq!(4, mat.start()); /// assert_eq!(7, mat.end()); /// ``` #[derive(Clone, Debug, Eq, Hash, PartialEq)] pub struct Match { /// The pattern id. pattern: usize, /// The length of this match, such that the starting position of the match /// is `end - len`. /// /// We use length here because, other than the pattern id, the only /// information about each pattern that the automaton stores is its length. /// So using the length here is just a bit more natural. But it isn't /// technically required. len: usize, /// The end offset of the match, exclusive. 
end: usize, } impl Match { /// Returns the identifier of the pattern that matched. /// /// The identifier of a pattern is derived from the position in which it /// was originally inserted into the corresponding automaton. The first /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` /// and so on. #[inline] pub fn pattern(&self) -> usize { self.pattern } /// The starting position of the match. #[inline] pub fn start(&self) -> usize { self.end - self.len } /// The ending position of the match. #[inline] pub fn end(&self) -> usize { self.end } /// Returns true if and only if this match is empty. That is, when /// `start() == end()`. /// /// An empty match can only be returned when the empty string was among /// the patterns used to build the Aho-Corasick automaton. #[inline] pub fn is_empty(&self) -> bool { self.len == 0 } #[inline] fn increment(&self, by: usize) -> Match { Match { pattern: self.pattern, len: self.len, end: self.end + by } } #[inline] fn from_span(id: usize, start: usize, end: usize) -> Match { Match { pattern: id, len: end - start, end: end } } } aho-corasick-0.7.8/src/nfa.rs010066400017500001731000001510301361627453200142250ustar0000000000000000use std::cmp; use std::collections::{BTreeSet, VecDeque}; use std::fmt; use std::mem::size_of; use std::ops::{Index, IndexMut}; use ahocorasick::MatchKind; use automaton::Automaton; use classes::{ByteClassBuilder, ByteClasses}; use error::Result; use prefilter::{self, opposite_ascii_case, Prefilter, PrefilterObj}; use state_id::{dead_id, fail_id, usize_to_state_id, StateID}; use Match; /// The identifier for a pattern, which is simply the position of the pattern /// in the sequence of patterns given by the caller. pub type PatternID = usize; /// The length of a pattern, in bytes. pub type PatternLength = usize; /// An Aho-Corasick automaton, represented as an NFA. /// /// This is the classical formulation of Aho-Corasick, which involves building /// up a prefix trie of a given set of patterns, and then wiring up failure /// transitions between states in order to guarantee linear time matching. The /// standard formulation is, technically, an NFA because of these failure /// transitions. That is, one can see them as enabling the automaton to be in /// multiple states at once. Indeed, during search, it is possible to check /// the transitions on multiple states for a single input byte. /// /// This particular implementation not only supports the standard style of /// matching, but also provides a mode for choosing leftmost-first or /// leftmost-longest match semantics. When a leftmost mode is chosen, some /// failure transitions that would otherwise be added are elided. See /// the documentation of `MatchKind` for more details and examples on how the /// match semantics may differ. /// /// If one wants a DFA, then it is necessary to first build an NFA and convert /// it into a DFA. Note, however, that because we've constrained ourselves to /// matching literal patterns, this does not need to use subset construction /// for determinization. Instead, the DFA has at most a number of states /// equivalent to the number of NFA states. The only real difference between /// them is that all failure transitions are followed and pre-computed. This /// uses much more memory, but also executes searches more quickly. #[derive(Clone)] pub struct NFA { /// The match semantics built into this NFA. match_kind: MatchKind, /// The start state id as an index into `states`. 
start_id: S, /// The length, in bytes, of the longest pattern in this automaton. This /// information is useful for keeping correct buffer sizes when searching /// on streams. max_pattern_len: usize, /// The total number of patterns added to this automaton, including /// patterns that may never be matched. pattern_count: usize, /// The number of bytes of heap used by this NFA's transition table. heap_bytes: usize, /// A prefilter for quickly skipping to candidate matches, if pertinent. prefilter: Option, /// Whether this automaton anchors all matches to the start of input. anchored: bool, /// A set of equivalence classes in terms of bytes. We compute this while /// building the NFA, but don't use it in the NFA's states. Instead, we /// use this for building the DFA. We store it on the NFA since it's easy /// to compute while visiting the the patterns. byte_classes: ByteClasses, /// A set of states. Each state defines its own transitions, a fail /// transition and a set of indices corresponding to matches. /// /// The first state is always the fail state, which is used only as a /// sentinel. Namely, in the final NFA, no transition into the fail state /// exists. (Well, they do, but they aren't followed. Instead, the state's /// failure transition is followed.) /// /// The second state (index 1) is always the dead state. Dead states are /// in every automaton, but only used when leftmost-{first,longest} match /// semantics are enabled. Specifically, they instruct search to stop /// at specific points in order to report the correct match location. In /// the standard Aho-Corasick construction, there are no transitions to /// the dead state. /// /// The third state (index 2) is generally intended to be the starting or /// "root" state. states: Vec>, } impl NFA { /// Returns the equivalence classes of bytes found while constructing /// this NFA. /// /// Note that the NFA doesn't actually make use of these equivalence /// classes. Instead, these are useful for building the DFA when desired. pub fn byte_classes(&self) -> &ByteClasses { &self.byte_classes } /// Returns a prefilter, if one exists. pub fn prefilter_obj(&self) -> Option<&PrefilterObj> { self.prefilter.as_ref() } /// Returns the total number of heap bytes used by this NFA's transition /// table. pub fn heap_bytes(&self) -> usize { self.heap_bytes + self.prefilter.as_ref().map_or(0, |p| p.as_ref().heap_bytes()) } /// Return the length of the longest pattern in this automaton. pub fn max_pattern_len(&self) -> usize { self.max_pattern_len } /// Return the total number of patterns added to this automaton. pub fn pattern_count(&self) -> usize { self.pattern_count } /// Returns the total number of states in this NFA. pub fn state_len(&self) -> usize { self.states.len() } /// Returns the matches for the given state. pub fn matches(&self, id: S) -> &[(PatternID, PatternLength)] { &self.states[id.to_usize()].matches } /// Returns an iterator over all transitions in the given state according /// to the given equivalence classes, including transitions to `fail_id()`. /// The number of transitions returned is always equivalent to the number /// of equivalence classes. pub fn iter_all_transitions( &self, byte_classes: &ByteClasses, id: S, f: F, ) { self.states[id.to_usize()].trans.iter_all(byte_classes, f); } /// Returns the failure transition for the given state. pub fn failure_transition(&self, id: S) -> S { self.states[id.to_usize()].fail } /// Returns the next state for the given state and input byte. 
/// /// Note that this does not follow failure transitions. As such, the id /// returned may be `fail_id`. pub fn next_state(&self, current: S, input: u8) -> S { self.states[current.to_usize()].next_state(input) } fn state(&self, id: S) -> &State { &self.states[id.to_usize()] } fn state_mut(&mut self, id: S) -> &mut State { &mut self.states[id.to_usize()] } fn start(&self) -> &State { self.state(self.start_id) } fn start_mut(&mut self) -> &mut State { let id = self.start_id; self.state_mut(id) } fn iter_transitions_mut(&mut self, id: S) -> IterTransitionsMut { IterTransitionsMut::new(self, id) } fn copy_matches(&mut self, src: S, dst: S) { let (src, dst) = get_two_mut(&mut self.states, src.to_usize(), dst.to_usize()); dst.matches.extend_from_slice(&src.matches); } fn copy_empty_matches(&mut self, dst: S) { let start_id = self.start_id; self.copy_matches(start_id, dst); } fn add_dense_state(&mut self, depth: usize) -> Result { let trans = Transitions::Dense(Dense::new()); let id = usize_to_state_id(self.states.len())?; self.states.push(State { trans, // Anchored automatons do not have any failure transitions. fail: if self.anchored { dead_id() } else { self.start_id }, depth: depth, matches: vec![], }); Ok(id) } fn add_sparse_state(&mut self, depth: usize) -> Result { let trans = Transitions::Sparse(vec![]); let id = usize_to_state_id(self.states.len())?; self.states.push(State { trans, // Anchored automatons do not have any failure transitions. fail: if self.anchored { dead_id() } else { self.start_id }, depth: depth, matches: vec![], }); Ok(id) } } impl Automaton for NFA { type ID = S; fn match_kind(&self) -> &MatchKind { &self.match_kind } fn anchored(&self) -> bool { self.anchored } fn prefilter(&self) -> Option<&dyn Prefilter> { self.prefilter.as_ref().map(|p| p.as_ref()) } fn start_state(&self) -> S { self.start_id } fn is_valid(&self, id: S) -> bool { id.to_usize() < self.states.len() } fn is_match_state(&self, id: S) -> bool { self.states[id.to_usize()].is_match() } fn get_match( &self, id: S, match_index: usize, end: usize, ) -> Option { let state = match self.states.get(id.to_usize()) { None => return None, Some(state) => state, }; state.matches.get(match_index).map(|&(id, len)| Match { pattern: id, len, end, }) } fn match_count(&self, id: S) -> usize { self.states[id.to_usize()].matches.len() } fn next_state(&self, mut current: S, input: u8) -> S { // This terminates since: // // 1. `State.fail` never points to fail_id(). // 2. All `State.fail` values point to a state closer to `start`. // 3. The start state has no transitions to fail_id(). loop { let state = &self.states[current.to_usize()]; let next = state.next_state(input); if next != fail_id() { return next; } current = state.fail; } } } /// A representation of an NFA state for an Aho-Corasick automaton. /// /// It contains the transitions to the next state, a failure transition for /// cases where there exists no other transition for the current input byte, /// the matches implied by visiting this state (if any) and the depth of this /// state. The depth of a state is simply the distance from it to the start /// state in the automaton, where the depth of the start state is 0. #[derive(Clone, Debug)] pub struct State { trans: Transitions, fail: S, matches: Vec<(PatternID, PatternLength)>, // TODO: Strictly speaking, this isn't needed for searching. It's only // used when building an NFA that supports leftmost match semantics. 
We // could drop this from the state and dynamically build a map only when // computing failure transitions, but it's not clear which is better. // Benchmark this. depth: usize, } impl State { fn heap_bytes(&self) -> usize { self.trans.heap_bytes() + (self.matches.len() * size_of::<(PatternID, PatternLength)>()) } fn add_match(&mut self, i: PatternID, len: PatternLength) { self.matches.push((i, len)); } fn is_match(&self) -> bool { !self.matches.is_empty() } fn get_longest_match_len(&self) -> Option { // Why is this true? Because the first match in any matching state // will always correspond to the match added to it during trie // construction (since when we copy matches due to failure transitions, // we always append them). Therefore, it follows that the first match // must always be longest since any subsequent match must be from a // failure transition, and a failure transition by construction points // to a proper suffix. A proper suffix is, by definition, smaller. self.matches.get(0).map(|&(_, len)| len) } fn next_state(&self, input: u8) -> S { self.trans.next_state(input) } fn set_next_state(&mut self, input: u8, next: S) { self.trans.set_next_state(input, next); } } /// Represents the transitions for a single dense state. /// /// The primary purpose here is to encapsulate index access. Namely, since a /// dense representation always contains 256 elements, all values of `u8` are /// valid indices. #[derive(Clone, Debug)] struct Dense(Vec); impl Dense where S: StateID, { fn new() -> Self { Dense(vec![fail_id(); 256]) } #[inline] fn len(&self) -> usize { self.0.len() } } impl Index for Dense { type Output = S; #[inline] fn index(&self, i: u8) -> &S { // SAFETY: This is safe because all dense transitions have // exactly 256 elements, so all u8 values are valid indices. &self.0[i as usize] } } impl IndexMut for Dense { #[inline] fn index_mut(&mut self, i: u8) -> &mut S { // SAFETY: This is safe because all dense transitions have // exactly 256 elements, so all u8 values are valid indices. &mut self.0[i as usize] } } /// A representation of transitions in an NFA. /// /// Transitions have either a sparse representation, which is slower for /// lookups but uses less memory, or a dense representation, which is faster /// for lookups but uses more memory. In the sparse representation, the absence /// of a state implies a transition to `fail_id()`. Transitions to `dead_id()` /// are still explicitly represented. /// /// For the NFA, by default, we use a dense representation for transitions for /// states close to the start state because it's likely these are the states /// that will be most frequently visited. 
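// A standalone illustrative sketch (not part of this crate): the two
// transition representations described above, with state ids simplified to
// plain `usize` and a hypothetical `FAIL` sentinel standing in for
// `fail_id()`. A dense table answers a lookup with a single index, while a
// sparse list scans its entries and falls back to the sentinel.
#[allow(dead_code)]
mod transition_repr_sketch {
    const FAIL: usize = 0;

    // Dense: one entry per possible byte value.
    struct DenseSketch([usize; 256]);

    impl DenseSketch {
        fn next_state(&self, input: u8) -> usize {
            self.0[input as usize]
        }
    }

    // Sparse: only the defined transitions, kept in byte order.
    struct SparseSketch(Vec<(u8, usize)>);

    impl SparseSketch {
        fn next_state(&self, input: u8) -> usize {
            for &(b, id) in &self.0 {
                if b == input {
                    return id;
                }
            }
            FAIL
        }
    }
}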
#[derive(Clone, Debug)] enum Transitions { Sparse(Vec<(u8, S)>), Dense(Dense), } impl Transitions { fn heap_bytes(&self) -> usize { match *self { Transitions::Sparse(ref sparse) => { sparse.len() * size_of::<(u8, S)>() } Transitions::Dense(ref dense) => dense.len() * size_of::(), } } fn next_state(&self, input: u8) -> S { match *self { Transitions::Sparse(ref sparse) => { for &(b, id) in sparse { if b == input { return id; } } fail_id() } Transitions::Dense(ref dense) => dense[input], } } fn set_next_state(&mut self, input: u8, next: S) { match *self { Transitions::Sparse(ref mut sparse) => { match sparse.binary_search_by_key(&input, |&(b, _)| b) { Ok(i) => sparse[i] = (input, next), Err(i) => sparse.insert(i, (input, next)), } } Transitions::Dense(ref mut dense) => { dense[input] = next; } } } /// Iterate over transitions in this state while skipping over transitions /// to `fail_id()`. fn iter(&self, mut f: F) { match *self { Transitions::Sparse(ref sparse) => { for &(b, id) in sparse { f(b, id); } } Transitions::Dense(ref dense) => { for b in AllBytesIter::new() { let id = dense[b]; if id != fail_id() { f(b, id); } } } } } /// Iterate over all transitions in this state according to the given /// equivalence classes, including transitions to `fail_id()`. fn iter_all(&self, classes: &ByteClasses, mut f: F) { if classes.is_singleton() { match *self { Transitions::Sparse(ref sparse) => { sparse_iter(sparse, f); } Transitions::Dense(ref dense) => { for b in AllBytesIter::new() { f(b, dense[b]); } } } } else { // In this case, we only want to yield a single byte for each // equivalence class. match *self { Transitions::Sparse(ref sparse) => { let mut last_class = None; sparse_iter(sparse, |b, next| { let class = classes.get(b); if last_class != Some(class) { last_class = Some(class); f(b, next); } }) } Transitions::Dense(ref dense) => { for b in classes.representatives() { f(b, dense[b]); } } } } } } /// Iterator over transitions in a state, skipping transitions to `fail_id()`. /// /// This abstracts over the representation of NFA transitions, which may be /// either in a sparse or dense representation. /// /// This somewhat idiosyncratically borrows the NFA mutably, so that when one /// is iterating over transitions, the caller can still mutate the NFA. This /// is useful when creating failure transitions. #[derive(Debug)] struct IterTransitionsMut<'a, S: StateID + 'a> { nfa: &'a mut NFA, state_id: S, cur: usize, } impl<'a, S: StateID> IterTransitionsMut<'a, S> { fn new(nfa: &'a mut NFA, state_id: S) -> IterTransitionsMut<'a, S> { IterTransitionsMut { nfa, state_id, cur: 0 } } fn nfa(&mut self) -> &mut NFA { self.nfa } } impl<'a, S: StateID> Iterator for IterTransitionsMut<'a, S> { type Item = (u8, S); fn next(&mut self) -> Option<(u8, S)> { match self.nfa.states[self.state_id.to_usize()].trans { Transitions::Sparse(ref sparse) => { if self.cur >= sparse.len() { return None; } let i = self.cur; self.cur += 1; Some(sparse[i]) } Transitions::Dense(ref dense) => { while self.cur < dense.len() { // There are always exactly 255 transitions in dense repr. debug_assert!(self.cur < 256); let b = self.cur as u8; let id = dense[b]; self.cur += 1; if id != fail_id() { return Some((b, id)); } } None } } } } /// A simple builder for configuring the NFA construction of Aho-Corasick. 
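// A standalone illustrative sketch (not part of this crate) of the
// equivalence-class iteration used by `iter_all` above: bytes that behave
// identically share a class id, so a caller only needs to see one
// representative byte per class. The `Classes` type below is a hypothetical
// stand-in for this crate's `ByteClasses`, and it assumes (as the real
// builder arranges) that each class covers a contiguous range of bytes.
#[allow(dead_code)]
mod byte_class_sketch {
    // Maps every byte value to a small class identifier.
    struct Classes([u8; 256]);

    impl Classes {
        // Yields the first byte of each contiguous class.
        fn representatives(&self) -> Vec<u8> {
            let mut reps = Vec::new();
            let mut last: Option<u8> = None;
            for b in 0u16..256 {
                let b = b as u8;
                let class = self.0[b as usize];
                if last != Some(class) {
                    last = Some(class);
                    reps.push(b);
                }
            }
            reps
        }
    }
}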
#[derive(Clone, Debug)] pub struct Builder { dense_depth: usize, match_kind: MatchKind, prefilter: bool, anchored: bool, ascii_case_insensitive: bool, } impl Default for Builder { fn default() -> Builder { Builder { dense_depth: 2, match_kind: MatchKind::default(), prefilter: true, anchored: false, ascii_case_insensitive: false, } } } impl Builder { pub fn new() -> Builder { Builder::default() } pub fn build(&self, patterns: I) -> Result> where I: IntoIterator, P: AsRef<[u8]>, { Compiler::new(self)?.compile(patterns) } pub fn match_kind(&mut self, kind: MatchKind) -> &mut Builder { self.match_kind = kind; self } pub fn dense_depth(&mut self, depth: usize) -> &mut Builder { self.dense_depth = depth; self } pub fn prefilter(&mut self, yes: bool) -> &mut Builder { self.prefilter = yes; self } pub fn anchored(&mut self, yes: bool) -> &mut Builder { self.anchored = yes; self } pub fn ascii_case_insensitive(&mut self, yes: bool) -> &mut Builder { self.ascii_case_insensitive = yes; self } } /// A compiler uses a builder configuration and builds up the NFA formulation /// of an Aho-Corasick automaton. This roughly corresponds to the standard /// formulation described in textbooks. #[derive(Debug)] struct Compiler<'a, S: StateID> { builder: &'a Builder, prefilter: prefilter::Builder, nfa: NFA, byte_classes: ByteClassBuilder, } impl<'a, S: StateID> Compiler<'a, S> { fn new(builder: &'a Builder) -> Result> { Ok(Compiler { builder: builder, prefilter: prefilter::Builder::new(builder.match_kind) .ascii_case_insensitive(builder.ascii_case_insensitive), nfa: NFA { match_kind: builder.match_kind, start_id: usize_to_state_id(2)?, max_pattern_len: 0, pattern_count: 0, heap_bytes: 0, prefilter: None, anchored: builder.anchored, byte_classes: ByteClasses::singletons(), states: vec![], }, byte_classes: ByteClassBuilder::new(), }) } fn compile(mut self, patterns: I) -> Result> where I: IntoIterator, P: AsRef<[u8]>, { self.add_state(0)?; // the fail state, which is never entered self.add_state(0)?; // the dead state, only used for leftmost self.add_state(0)?; // the start state self.build_trie(patterns)?; self.add_start_state_loop(); self.add_dead_state_loop(); if !self.builder.anchored { if self.match_kind().is_leftmost() { self.fill_failure_transitions_leftmost(); } else { self.fill_failure_transitions_standard(); } } self.close_start_state_loop(); self.nfa.byte_classes = self.byte_classes.build(); if !self.builder.anchored { self.nfa.prefilter = self.prefilter.build(); } self.calculate_size(); Ok(self.nfa) } /// This sets up the initial prefix trie that makes up the Aho-Corasick /// automaton. Effectively, it creates the basic structure of the /// automaton, where every pattern given has a path from the start state to /// the end of the pattern. fn build_trie(&mut self, patterns: I) -> Result<()> where I: IntoIterator, P: AsRef<[u8]>, { 'PATTERNS: for (pati, pat) in patterns.into_iter().enumerate() { let pat = pat.as_ref(); self.nfa.max_pattern_len = cmp::max(self.nfa.max_pattern_len, pat.len()); self.nfa.pattern_count += 1; let mut prev = self.nfa.start_id; let mut saw_match = false; for (depth, &b) in pat.iter().enumerate() { // When leftmost-first match semantics are requested, we // specifically stop adding patterns when a previously added // pattern is a prefix of it. We avoid adding it because // leftmost-first semantics imply that the pattern can never // match. This is not just an optimization to save space! It // is necessary for correctness. 
In fact, this is the only // difference in the automaton between the implementations for // leftmost-first and leftmost-longest. saw_match = saw_match || self.nfa.state(prev).is_match(); if self.builder.match_kind.is_leftmost_first() && saw_match { // Skip to the next pattern immediately. This avoids // incorrectly adding a match after this loop terminates. continue 'PATTERNS; } // Add this byte to our equivalence classes. We don't use these // for NFA construction. These are instead used only if we're // building a DFA. They would technically be useful for the // NFA, but it would require a second pass over the patterns. self.byte_classes.set_range(b, b); if self.builder.ascii_case_insensitive { let b = opposite_ascii_case(b); self.byte_classes.set_range(b, b); } // If the transition from prev using the current byte already // exists, then just move through it. Otherwise, add a new // state. We track the depth here so that we can determine // how to represent transitions. States near the start state // use a dense representation that uses more memory but is // faster. Other states use a sparse representation that uses // less memory but is slower. let next = self.nfa.state(prev).next_state(b); if next != fail_id() { prev = next; } else { let next = self.add_state(depth + 1)?; self.nfa.state_mut(prev).set_next_state(b, next); if self.builder.ascii_case_insensitive { let b = opposite_ascii_case(b); self.nfa.state_mut(prev).set_next_state(b, next); } prev = next; } } // Once the pattern has been added, log the match in the final // state that it reached. self.nfa.state_mut(prev).add_match(pati, pat.len()); // ... and hand it to the prefilter builder, if applicable. if self.builder.prefilter { self.prefilter.add(pat); } } Ok(()) } /// This routine creates failure transitions according to the standard /// textbook formulation of the Aho-Corasick algorithm. /// /// Building failure transitions is the most interesting part of building /// the Aho-Corasick automaton, because they are what allow searches to /// be performed in linear time. Specifically, a failure transition is /// a single transition associated with each state that points back to /// the longest proper suffix of the pattern being searched. The failure /// transition is followed whenever there exists no transition on the /// current state for the current input byte. If there is no other proper /// suffix, then the failure transition points back to the starting state. /// /// For example, let's say we built an Aho-Corasick automaton with the /// following patterns: 'abcd' and 'cef'. The trie looks like this: /// /// ```ignore /// a - S1 - b - S2 - c - S3 - d - S4* /// / /// S0 - c - S5 - e - S6 - f - S7* /// ``` /// /// At this point, it should be fairly straight-forward to see how this /// trie can be used in a simplistic way. At any given position in the /// text we're searching (called the "subject" string), all we need to do /// is follow the transitions in the trie by consuming one transition for /// each byte in the subject string. If we reach a match state, then we can /// report that location as a match. /// /// The trick comes when searching a subject string like 'abcef'. We'll /// initially follow the transition from S0 to S1 and wind up in S3 after /// observng the 'c' byte. At this point, the next byte is 'e' but state /// S3 has no transition for 'e', so the search fails. We then would need /// to restart the search at the next position in 'abcef', which /// corresponds to 'b'. 
The match would fail, but the next search starting /// at 'c' would finally succeed. The problem with this approach is that /// we wind up searching the subject string potentially many times. In /// effect, this makes the algorithm have worst case `O(n * m)` complexity, /// where `n ~ len(subject)` and `m ~ len(all patterns)`. We would instead /// like to achieve a `O(n + m)` worst case complexity. /// /// This is where failure transitions come in. Instead of dying at S3 in /// the first search, the automaton can instruct the search to move to /// another part of the automaton that corresponds to a suffix of what /// we've seen so far. Recall that we've seen 'abc' in the subject string, /// and the automaton does indeed have a non-empty suffix, 'c', that could /// potentially lead to another match. Thus, the actual Aho-Corasick /// automaton for our patterns in this case looks like this: /// /// ```ignore /// a - S1 - b - S2 - c - S3 - d - S4* /// / / /// / ---------------- /// / / /// S0 - c - S5 - e - S6 - f - S7* /// ``` /// /// That is, we have a failure transition from S3 to S5, which is followed /// exactly in cases when we are in state S3 but see any byte other than /// 'd' (that is, we've "failed" to find a match in this portion of our /// trie). We know we can transition back to S5 because we've already seen /// a 'c' byte, so we don't need to re-scan it. We can then pick back up /// with the search starting at S5 and complete our match. /// /// Adding failure transitions to a trie is fairly simple, but subtle. The /// key issue is that you might have multiple failure transition that you /// need to follow. For example, look at the trie for the patterns /// 'abcd', 'b', 'bcd' and 'cd': /// /// ```ignore /// - a - S1 - b - S2 - c - S3 - d - S4* /// / /// S0 - b - S5* - c - S6 - d - S7* /// \ /// - c - S8 - d - S9* /// ``` /// /// The failure transitions for this trie are defined from S2 to S5, /// S3 to S6 and S6 to S8. Moreover, state S2 needs to track that it /// corresponds to a match, since its failure transition to S5 is itself /// a match state. /// /// Perhaps simplest way to think about adding these failure transitions /// is recursively. That is, if you know the failure transitions for every /// possible previous state that could be visited (e.g., when computing the /// failure transition for S3, you already know the failure transitions /// for S0, S1 and S2), then you can simply follow the failure transition /// of the previous state and check whether the incoming transition is /// defined after following the failure transition. /// /// For example, when determining the failure state for S3, by our /// assumptions, we already know that there is a failure transition from /// S2 (the previous state) to S5. So we follow that transition and check /// whether the transition connecting S2 to S3 is defined. Indeed, it is, /// as there is a transition from S5 to S6 for the byte 'c'. If no such /// transition existed, we could keep following the failure transitions /// until we reach the start state, which is the failure transition for /// every state that has no corresponding proper suffix. /// /// We don't actually use recursion to implement this, but instead, use a /// breadth first search of the automaton. Our base case is the start /// state, whose failure transition is just a transition to itself. fn fill_failure_transitions_standard(&mut self) { // Initialize the queue for breadth first search with all transitions // out of the start state. 
We handle the start state specially because // we only want to follow non-self transitions. If we followed self // transitions, then this would never terminate. let mut queue = VecDeque::new(); let mut seen = self.queued_set(); for b in AllBytesIter::new() { let next = self.nfa.start().next_state(b); if next != self.nfa.start_id { if !seen.contains(next) { queue.push_back(next); seen.insert(next); } } } while let Some(id) = queue.pop_front() { let mut it = self.nfa.iter_transitions_mut(id); while let Some((b, next)) = it.next() { if !seen.contains(next) { queue.push_back(next); seen.insert(next); } let mut fail = it.nfa().state(id).fail; while it.nfa().state(fail).next_state(b) == fail_id() { fail = it.nfa().state(fail).fail; } fail = it.nfa().state(fail).next_state(b); it.nfa().state_mut(next).fail = fail; it.nfa().copy_matches(fail, next); } // If the start state is a match state, then this automaton can // match the empty string. This implies all states are match states // since every position matches the empty string, so copy the // matches from the start state to every state. Strictly speaking, // this is only necessary for overlapping matches since each // non-empty non-start match state needs to report empty matches // in addition to its own. For the non-overlapping case, such // states only report the first match, which is never empty since // it isn't a start state. it.nfa().copy_empty_matches(id); } } /// This routine is just like fill_failure_transitions_standard, except /// it adds failure transitions in a way that preserves leftmost match /// semantics (for both leftmost-first and leftmost-longest). /// /// The algorithms are so similar that it would be possible to write it /// generically. But doing so without overhead would require a bit of /// ceremony, so we just copy it and add in the extra leftmost logic. /// Moreover, the standard algorithm above is so simple that it feels like /// a crime to disturb it. /// /// In effect, this proceeds just like the standard approach, but we /// specifically add only a subset of all failure transitions. Namely, we /// only add failure transitions that either do not occur after a match /// or failure transitions that do occur after a match but preserve the /// match. The comments in the implementation below should help. /// /// N.B. The only differences in the automaton between leftmost-first and /// leftmost-longest are in trie construction. Otherwise, both have exactly /// the same set of failure transitions. leftmost-longest adds everything /// to the trie, where as leftmost-first skips any patterns for which there /// exists a prefix of it that was added earlier. /// /// N.B. I came up with this algorithm on my own, and after scouring all of /// the other AC implementations I know of (Perl, Snort, many on GitHub). /// I couldn't find any that implement leftmost semantics like this. /// Perl of course needs leftmost-first semantics, but they implement it /// with a seeming hack at *search* time instead of encoding it into the /// automaton. There are also a couple Java libraries that support leftmost /// longest semantics, but they do it by building a queue of matches at /// search time, which is even worse than what Perl is doing. ---AG fn fill_failure_transitions_leftmost(&mut self) { /// Represents an item in our queue of states to process. /// /// Fundamentally, this queue serves the same purpose as the queue /// for filling failure transitions using the standard formulation. 
/// In the leftmost case, though, we need to track a bit more /// information. See comments below. #[derive(Clone, Copy, Debug)] struct QueuedState { /// The id of the state to visit. id: S, /// The depth at which the first match was observed in the path /// to this state. Note that this corresponds to the depth at /// which the beginning of the match was detected. If no match /// has been seen, then this is None. match_at_depth: Option, } impl QueuedState { /// Create a queued state corresponding to the given NFA's start /// state. fn start(nfa: &NFA) -> QueuedState { let match_at_depth = if nfa.start().is_match() { Some(0) } else { None }; QueuedState { id: nfa.start_id, match_at_depth } } /// Return the next state to queue up. The given id must be a state /// corresponding to a single transition from this queued state. fn next_queued_state( &self, nfa: &NFA, id: S, ) -> QueuedState { let match_at_depth = self.next_match_at_depth(nfa, id); QueuedState { id, match_at_depth } } /// Return the earliest depth at which a match has occurred for /// the given state. The given state must correspond to a single /// transition from this queued state. fn next_match_at_depth( &self, nfa: &NFA, next: S, ) -> Option { // This is a little tricky. If the previous state has already // seen a match or if `next` isn't a match state, then nothing // needs to change since a later state cannot find an earlier // match. match self.match_at_depth { Some(x) => return Some(x), None if nfa.state(next).is_match() => {} None => return None, } let depth = nfa.state(next).depth - nfa.state(next).get_longest_match_len().unwrap() + 1; Some(depth) } } // Initialize the queue for breadth first search with all transitions // out of the start state. We handle the start state specially because // we only want to follow non-self transitions. If we followed self // transitions, then this would never terminate. let mut queue: VecDeque> = VecDeque::new(); let mut seen = self.queued_set(); let start = QueuedState::start(&self.nfa); for b in AllBytesIter::new() { let next_id = self.nfa.start().next_state(b); if next_id != start.id { let next = start.next_queued_state(&self.nfa, next_id); if !seen.contains(next.id) { queue.push_back(next); seen.insert(next.id); } // If a state immediately following the start state is a match // state, then we never want to follow its failure transition // since the failure transition necessarily leads back to the // start state, which we never want to do for leftmost matching // after a match has been found. // // N.B. This is a special case of the more general handling // found below. if self.nfa.state(next_id).is_match() { self.nfa.state_mut(next_id).fail = dead_id(); } } } while let Some(item) = queue.pop_front() { let mut any_trans = false; let mut it = self.nfa.iter_transitions_mut(item.id); while let Some((b, next_id)) = it.next() { any_trans = true; // Queue up the next state. let next = item.next_queued_state(it.nfa(), next_id); if !seen.contains(next.id) { queue.push_back(next); seen.insert(next.id); } // Find the failure state for next. Same as standard. let mut fail = it.nfa().state(item.id).fail; while it.nfa().state(fail).next_state(b) == fail_id() { fail = it.nfa().state(fail).fail; } fail = it.nfa().state(fail).next_state(b); // This is the key difference from the standard formulation. // Namely, if we've seen a match, then we only want a failure // transition if the failure transition preserves the match // we've seen. 
In general, this is not true of all failure // transitions since they can point back to any suffix of what // we've seen so far. Instead, we only want to point back to // suffixes that contain any match we've seen. // // We achieve this by comparing the depth of the failure // transition with the number of states between this state // and the beginning of the earliest match detected. If the // depth of the failure state is smaller than this difference, // then it cannot contain the match. If it's bigger or equal // to the difference, then it necessarily includes the match // we've seen since all failure transitions correspond to a // suffix. // // If we've determined that we don't want the failure // transition, then we set this state's failure transition to // the dead state. In other words, when a search hits this // state, it will not continue and correctly stop. (N.B. A // dead state is different than a fail state. A dead state // MUST be preceded by a match and acts as a sentinel to search // routines to terminate.) // // Understanding this is tricky, and it took me several days // to think through this and get it right. If you want to grok // it, then I'd recommend: 1) switch the implementation to // always use the standard algorithm for filling in failure // transitions, 2) run the test suite and 3) examine the test // failures. Write out the automatons for them and try to work // backwards by figuring out which failure transitions should // be removed. You should arrive at the same rule used below. if let Some(match_depth) = next.match_at_depth { let fail_depth = it.nfa().state(fail).depth; let next_depth = it.nfa().state(next.id).depth; if next_depth - match_depth + 1 > fail_depth { it.nfa().state_mut(next.id).fail = dead_id(); continue; } assert_ne!( start.id, it.nfa().state(next.id).fail, "states that are match states or follow match \ states should never have a failure transition \ back to the start state in leftmost searching", ); } it.nfa().state_mut(next.id).fail = fail; it.nfa().copy_matches(fail, next.id); } // If there are no transitions for this state and if it's a match // state, then we must set its failure transition to the dead // state since we never want it to restart the search. if !any_trans && it.nfa().state(item.id).is_match() { it.nfa().state_mut(item.id).fail = dead_id(); } // We don't need to copy empty matches from the start state here // because that's only necessary for overlapping matches and // leftmost match kinds don't support overlapping matches. } } /// Returns a set that tracked queued states. /// /// This is only necessary when ASCII case insensitivity is enabled, since /// it is the only way to visit the same state twice. Otherwise, this /// returns an inert set that nevers adds anything and always reports /// `false` for every member test. fn queued_set(&self) -> QueuedSet { if self.builder.ascii_case_insensitive { QueuedSet::active() } else { QueuedSet::inert() } } /// Set the failure transitions on the start state to loop back to the /// start state. This effectively permits the Aho-Corasick automaton to /// match at any position. This is also required for finding the next /// state to terminate, namely, finding the next state should never return /// a fail_id. /// /// This must be done after building the initial trie, since trie /// construction depends on transitions to `fail_id` to determine whether a /// state already exists or not. 
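// A standalone illustrative sketch (not part of this crate) of the effect
// described above: once every otherwise-undefined byte on the start state
// loops back to the start state, one pass over the haystack finds matches at
// every position without ever explicitly restarting the search. The toy
// automaton below hard-codes the single pattern "ab" and is hypothetical.
#[allow(dead_code)]
fn start_state_loop_sketch() {
    // State 0 is the start state, state 1 means "just saw 'a'", and state 2
    // means "just matched 'ab'".
    fn next(state: usize, b: u8) -> usize {
        match (state, b) {
            (_, b'a') => 1, // an 'a' always (re)enters state 1
            (1, b'b') => 2, // completes "ab"
            _ => 0,         // anything else falls back to the looping start
        }
    }
    let mut state = 0;
    let mut match_ends = Vec::new();
    for (i, &b) in b"xxabab".iter().enumerate() {
        state = next(state, b);
        if state == 2 {
            match_ends.push(i + 1); // exclusive end offset of a match
        }
    }
    assert_eq!(match_ends, vec![4, 6]);
}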
fn add_start_state_loop(&mut self) { let start_id = self.nfa.start_id; let start = self.nfa.start_mut(); for b in AllBytesIter::new() { if start.next_state(b) == fail_id() { start.set_next_state(b, start_id); } } } /// Remove the start state loop by rewriting any transitions on the start /// state back to the start state with transitions to the dead state. /// /// The loop is only closed when two conditions are met: the start state /// is a match state and the match kind is leftmost-first or /// leftmost-longest. (Alternatively, if this is an anchored automaton, /// then the start state is always closed, regardless of aforementioned /// conditions.) /// /// The reason for this is that under leftmost semantics, a start state /// that is also a match implies that we should never restart the search /// process. We allow normal transitions out of the start state, but if /// none exist, we transition to the dead state, which signals that /// searching should stop. fn close_start_state_loop(&mut self) { if self.builder.anchored || (self.match_kind().is_leftmost() && self.nfa.start().is_match()) { let start_id = self.nfa.start_id; let start = self.nfa.start_mut(); for b in AllBytesIter::new() { if start.next_state(b) == start_id { start.set_next_state(b, dead_id()); } } } } /// Sets all transitions on the dead state to point back to the dead state. /// Normally, missing transitions map back to the failure state, but the /// point of the dead state is to act as a sink that can never be escaped. fn add_dead_state_loop(&mut self) { let dead = self.nfa.state_mut(dead_id()); for b in AllBytesIter::new() { dead.set_next_state(b, dead_id()); } } /// Computes the total amount of heap used by this NFA in bytes. fn calculate_size(&mut self) { let mut size = 0; for state in &self.nfa.states { size += state.heap_bytes(); } self.nfa.heap_bytes = size; } /// Add a new state to the underlying NFA with the given depth. The depth /// is used to determine how to represent the transitions. /// /// If adding the new state would overflow the chosen state ID /// representation, then this returns an error. fn add_state(&mut self, depth: usize) -> Result { if depth < self.builder.dense_depth { self.nfa.add_dense_state(depth) } else { self.nfa.add_sparse_state(depth) } } /// Returns the match kind configured on the underlying builder. fn match_kind(&self) -> MatchKind { self.builder.match_kind } } /// A set of state identifiers used to avoid revisiting the same state multiple /// times when filling in failure transitions. /// /// This set has an "inert" and an "active" mode. When inert, the set never /// stores anything and always returns `false` for every member test. This is /// useful to avoid the performance and memory overhead of maintaining this /// set when it is not needed. #[derive(Debug)] struct QueuedSet { set: Option>, } impl QueuedSet { /// Return an inert set that returns `false` for every state ID membership /// test. fn inert() -> QueuedSet { QueuedSet { set: None } } /// Return an active set that tracks state ID membership. fn active() -> QueuedSet { QueuedSet { set: Some(BTreeSet::new()) } } /// Inserts the given state ID into this set. (If the set is inert, then /// this is a no-op.) fn insert(&mut self, state_id: S) { if let Some(ref mut set) = self.set { set.insert(state_id); } } /// Returns true if and only if the given state ID is in this set. If the /// set is inert, this always returns false. 
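// A standalone illustrative sketch (not part of this crate) of the
// breadth-first failure link computation that
// `fill_failure_transitions_standard` above performs, over a toy trie.
// Everything here is a hypothetical simplification: state ids are plain
// `usize`, transitions live in a `HashMap`, and state 0 is the start state,
// whose failure link points to itself.
#[allow(dead_code)]
mod failure_links_sketch {
    use std::collections::{HashMap, VecDeque};

    struct ToyTrie {
        // trans[state] maps an input byte to the next state.
        trans: Vec<HashMap<u8, usize>>,
        // fail[state] is the failure link of `state`.
        fail: Vec<usize>,
    }

    impl ToyTrie {
        fn fill_failure_links(&mut self) {
            let root = 0;
            let mut queue = VecDeque::new();
            // Depth-1 states always fail directly back to the start state.
            let root_children: Vec<usize> =
                self.trans[root].values().copied().collect();
            for child in root_children {
                self.fail[child] = root;
                queue.push_back(child);
            }
            while let Some(id) = queue.pop_front() {
                let edges: Vec<(u8, usize)> =
                    self.trans[id].iter().map(|(&b, &s)| (b, s)).collect();
                for (b, next) in edges {
                    queue.push_back(next);
                    // Walk the parent's failure chain until some state has a
                    // transition on `b`, bottoming out at the start state.
                    let mut f = self.fail[id];
                    while f != root && !self.trans[f].contains_key(&b) {
                        f = self.fail[f];
                    }
                    self.fail[next] =
                        self.trans[f].get(&b).copied().unwrap_or(root);
                }
            }
        }
    }
}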
fn contains(&self, state_id: S) -> bool { match self.set { None => false, Some(ref set) => set.contains(&state_id), } } } /// An iterator over every byte value. /// /// We use this instead of (0..256).map(|b| b as u8) because this optimizes /// better in debug builds. /// /// We also use this instead of 0..=255 because we're targeting Rust 1.24 and /// inclusive range syntax was stabilized in Rust 1.26. We can get rid of this /// once our MSRV is Rust 1.26 or newer. #[derive(Debug)] struct AllBytesIter(u16); impl AllBytesIter { fn new() -> AllBytesIter { AllBytesIter(0) } } impl Iterator for AllBytesIter { type Item = u8; fn next(&mut self) -> Option { if self.0 >= 256 { None } else { let b = self.0 as u8; self.0 += 1; Some(b) } } } impl fmt::Debug for NFA { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { writeln!(f, "NFA(")?; writeln!(f, "match_kind: {:?}", self.match_kind)?; writeln!(f, "{}", "-".repeat(79))?; for (id, s) in self.states.iter().enumerate() { let mut trans = vec![]; s.trans.iter(|byte, next| { // The start state has a bunch of uninteresting transitions // back into itself. It's questionable to hide them since they // are critical to understanding the automaton, but they are // very noisy without better formatting for contiugous ranges // to the same state. if id == self.start_id.to_usize() && next == self.start_id { return; } // Similarly, the dead state has a bunch of uninteresting // transitions too. if id == dead_id() { return; } trans.push(format!("{} => {}", escape(byte), next.to_usize())); }); writeln!(f, "{:04}: {}", id, trans.join(", "))?; let matches: Vec = s .matches .iter() .map(|&(pattern_id, _)| pattern_id.to_string()) .collect(); writeln!(f, " matches: {}", matches.join(", "))?; writeln!(f, " fail: {}", s.fail.to_usize())?; writeln!(f, " depth: {}", s.depth)?; } writeln!(f, "{}", "-".repeat(79))?; writeln!(f, ")")?; Ok(()) } } /// Iterate over all possible byte transitions given a sparse set. fn sparse_iter(trans: &[(u8, S)], mut f: F) { let mut byte = 0u16; for &(b, id) in trans { while byte < (b as u16) { f(byte as u8, fail_id()); byte += 1; } f(b, id); byte += 1; } for b in byte..256 { f(b as u8, fail_id()); } } /// Safely return two mutable borrows to two different locations in the given /// slice. /// /// This panics if i == j. fn get_two_mut(xs: &mut [T], i: usize, j: usize) -> (&mut T, &mut T) { assert!(i != j, "{} must not be equal to {}", i, j); if i < j { let (before, after) = xs.split_at_mut(j); (&mut before[i], &mut after[0]) } else { let (before, after) = xs.split_at_mut(i); (&mut after[0], &mut before[j]) } } /// Return the given byte as its escaped string form. 
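// A standalone illustrative sketch (not part of this crate) of why the
// `get_two_mut` helper above is sound: `split_at_mut` yields two
// non-overlapping mutable slices, so taking one element from each side can
// never alias. The values below are arbitrary.
#[allow(dead_code)]
fn get_two_mut_sketch() {
    let mut xs = vec![10, 20, 30, 40];
    let (i, j) = (1, 3);
    // The same shape `get_two_mut` uses for the `i < j` case.
    let (before, after) = xs.split_at_mut(j);
    let (a, b) = (&mut before[i], &mut after[0]);
    *a += 1;
    *b += 1;
    assert_eq!(xs, vec![10, 21, 30, 41]);
}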
fn escape(b: u8) -> String { use std::ascii; String::from_utf8(ascii::escape_default(b).collect::>()).unwrap() } #[cfg(test)] mod tests { use super::*; #[test] fn scratch() { let nfa: NFA = Builder::new() .dense_depth(0) // .match_kind(MatchKind::LeftmostShortest) // .match_kind(MatchKind::LeftmostLongest) .match_kind(MatchKind::LeftmostFirst) // .build(&["abcd", "ce", "b"]) // .build(&["ab", "bc"]) // .build(&["b", "bcd", "ce"]) // .build(&["abc", "bx"]) // .build(&["abc", "bd", "ab"]) // .build(&["abcdefghi", "hz", "abcdefgh"]) // .build(&["abcd", "bce", "b"]) .build(&["abcdefg", "bcde", "bcdef"]) .unwrap(); println!("{:?}", nfa); } } aho-corasick-0.7.8/src/packed/api.rs010064400017500000144000000503321361064541200154500ustar0000000000000000use std::u16; use packed::pattern::Patterns; use packed::rabinkarp::RabinKarp; use packed::teddy::{self, Teddy}; use Match; /// This is a limit placed on the total number of patterns we're willing to try /// and match at once. As more sophisticated algorithms are added, this number /// may be increased. const PATTERN_LIMIT: usize = 128; /// A knob for controlling the match semantics of a packed multiple string /// searcher. /// /// This differs from the /// [`MatchKind`](../enum.MatchKind.html) /// type in the top-level crate module in that it doesn't support /// "standard" match semantics, and instead only supports leftmost-first or /// leftmost-longest. Namely, "standard" semantics cannot be easily supported /// by packed searchers. /// /// For more information on the distinction between leftmost-first and /// leftmost-longest, see the docs on the top-level `MatchKind` type. /// /// Unlike the top-level `MatchKind` type, the default match semantics for this /// type are leftmost-first. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum MatchKind { /// Use leftmost-first match semantics, which reports leftmost matches. /// When there are multiple possible leftmost matches, the match /// corresponding to the pattern that appeared earlier when constructing /// the automaton is reported. /// /// This is the default. LeftmostFirst, /// Use leftmost-longest match semantics, which reports leftmost matches. /// When there are multiple possible leftmost matches, the longest match /// is chosen. LeftmostLongest, /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients /// don't count on exhaustive matching. (Otherwise, adding a new variant /// could break existing code.) #[doc(hidden)] __Nonexhaustive, } impl Default for MatchKind { fn default() -> MatchKind { MatchKind::LeftmostFirst } } /// The configuration for a packed multiple pattern searcher. /// /// The configuration is currently limited only to being able to select the /// match semantics (leftmost-first or leftmost-longest) of a searcher. In the /// future, more knobs may be made available. /// /// A configuration produces a [`packed::Builder`](struct.Builder.html), which /// in turn can be used to construct a /// [`packed::Searcher`](struct.Searcher.html) for searching. /// /// # Example /// /// This example shows how to use leftmost-longest semantics instead of the /// default (leftmost-first). 
/// /// ``` /// use aho_corasick::packed::{Config, MatchKind}; /// /// # fn example() -> Option<()> { /// let searcher = Config::new() /// .match_kind(MatchKind::LeftmostLongest) /// .builder() /// .add("foo") /// .add("foobar") /// .build()?; /// let matches: Vec = searcher /// .find_iter("foobar") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![1], matches); /// # Some(()) } /// # if cfg!(target_arch = "x86_64") { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[derive(Clone, Debug)] pub struct Config { kind: MatchKind, force: Option, force_teddy_fat: Option, force_avx: Option, } /// An internal option for forcing the use of a particular packed algorithm. /// /// When an algorithm is forced, if a searcher could not be constructed for it, /// then no searcher will be returned even if an alternative algorithm would /// work. #[derive(Clone, Debug)] enum ForceAlgorithm { Teddy, RabinKarp, } impl Default for Config { fn default() -> Config { Config::new() } } impl Config { /// Create a new default configuration. A default configuration uses /// leftmost-first match semantics. pub fn new() -> Config { Config { kind: MatchKind::LeftmostFirst, force: None, force_teddy_fat: None, force_avx: None, } } /// Create a packed builder from this configuration. The builder can be /// used to accumulate patterns and create a /// [`Searcher`](struct.Searcher.html) /// from them. pub fn builder(&self) -> Builder { Builder::from_config(self.clone()) } /// Set the match semantics for this configuration. pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { self.kind = kind; self } /// An undocumented method for forcing the use of the Teddy algorithm. /// /// This is only exposed for more precise testing and benchmarks. Callers /// should not use it as it is not part of the API stability guarantees of /// this crate. #[doc(hidden)] pub fn force_teddy(&mut self, yes: bool) -> &mut Config { if yes { self.force = Some(ForceAlgorithm::Teddy); } else { self.force = None; } self } /// An undocumented method for forcing the use of the Fat Teddy algorithm. /// /// This is only exposed for more precise testing and benchmarks. Callers /// should not use it as it is not part of the API stability guarantees of /// this crate. #[doc(hidden)] pub fn force_teddy_fat(&mut self, yes: Option) -> &mut Config { self.force_teddy_fat = yes; self } /// An undocumented method for forcing the use of SSE (`Some(false)`) or /// AVX (`Some(true)`) algorithms. /// /// This is only exposed for more precise testing and benchmarks. Callers /// should not use it as it is not part of the API stability guarantees of /// this crate. #[doc(hidden)] pub fn force_avx(&mut self, yes: Option) -> &mut Config { self.force_avx = yes; self } /// An undocumented method for forcing the use of the Rabin-Karp algorithm. /// /// This is only exposed for more precise testing and benchmarks. Callers /// should not use it as it is not part of the API stability guarantees of /// this crate. #[doc(hidden)] pub fn force_rabin_karp(&mut self, yes: bool) -> &mut Config { if yes { self.force = Some(ForceAlgorithm::RabinKarp); } else { self.force = None; } self } } /// A builder for constructing a packed searcher from a collection of patterns. /// /// # Example /// /// This example shows how to use a builder to construct a searcher. By /// default, leftmost-first match semantics are used. 
/// /// ``` /// use aho_corasick::packed::{Builder, MatchKind}; /// /// # fn example() -> Option<()> { /// let searcher = Builder::new() /// .add("foobar") /// .add("foo") /// .build()?; /// let matches: Vec = searcher /// .find_iter("foobar") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![0], matches); /// # Some(()) } /// # if cfg!(target_arch = "x86_64") { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[derive(Clone, Debug)] pub struct Builder { /// The configuration of this builder and subsequent matcher. config: Config, /// Set to true if the builder detects that a matcher cannot be built. inert: bool, /// The patterns provided by the caller. patterns: Patterns, } impl Builder { /// Create a new builder for constructing a multi-pattern searcher. This /// constructor uses the default configuration. pub fn new() -> Builder { Builder::from_config(Config::new()) } fn from_config(config: Config) -> Builder { Builder { config, inert: false, patterns: Patterns::new() } } /// Build a searcher from the patterns added to this builder so far. pub fn build(&self) -> Option { if self.inert || self.patterns.is_empty() { return None; } let mut patterns = self.patterns.clone(); patterns.set_match_kind(self.config.kind); let rabinkarp = RabinKarp::new(&patterns); // Effectively, we only want to return a searcher if we can use Teddy, // since Teddy is our only fast packed searcher at the moment. // Rabin-Karp is only used when searching haystacks smaller than what // Teddy can support. Thus, the only way to get a Rabin-Karp searcher // is to force it using undocumented APIs (for tests/benchmarks). let (search_kind, minimum_len) = match self.config.force { None | Some(ForceAlgorithm::Teddy) => { let teddy = match self.build_teddy(&patterns) { None => return None, Some(teddy) => teddy, }; let minimum_len = teddy.minimum_len(); (SearchKind::Teddy(teddy), minimum_len) } Some(ForceAlgorithm::RabinKarp) => (SearchKind::RabinKarp, 0), }; Some(Searcher { config: self.config.clone(), patterns: patterns, rabinkarp: rabinkarp, search_kind, minimum_len, }) } fn build_teddy(&self, patterns: &Patterns) -> Option { teddy::Builder::new() .avx(self.config.force_avx) .fat(self.config.force_teddy_fat) .build(&patterns) } /// Add the given pattern to this set to match. /// /// The order in which patterns are added is significant. Namely, when /// using leftmost-first match semantics, then when multiple patterns can /// match at a particular location, the pattern that was added first is /// used as the match. /// /// If the number of patterns added exceeds the amount supported by packed /// searchers, then the builder will stop accumulating patterns and render /// itself inert. At this point, constructing a searcher will always return /// `None`. pub fn add>(&mut self, pattern: P) -> &mut Builder { if self.inert { return self; } else if self.patterns.len() >= PATTERN_LIMIT { self.inert = true; self.patterns.reset(); return self; } // Just in case PATTERN_LIMIT increases beyond u16::MAX. assert!(self.patterns.len() <= u16::MAX as usize); let pattern = pattern.as_ref(); if pattern.is_empty() { self.inert = true; self.patterns.reset(); return self; } self.patterns.add(pattern); self } /// Add the given iterator of patterns to this set to match. /// /// The iterator must yield elements that can be converted into a `&[u8]`. /// /// The order in which patterns are added is significant. 
Namely, when /// using leftmost-first match semantics, then when multiple patterns can /// match at a particular location, the pattern that was added first is /// used as the match. /// /// If the number of patterns added exceeds the amount supported by packed /// searchers, then the builder will stop accumulating patterns and render /// itself inert. At this point, constructing a searcher will always return /// `None`. pub fn extend(&mut self, patterns: I) -> &mut Builder where I: IntoIterator, P: AsRef<[u8]>, { for p in patterns { self.add(p); } self } } impl Default for Builder { fn default() -> Builder { Builder::new() } } /// A packed searcher for quickly finding occurrences of multiple patterns. /// /// If callers need more flexible construction, or if one wants to change the /// match semantics (either leftmost-first or leftmost-longest), then one can /// use the [`Config`](struct.Config.html) and/or /// [`Builder`](struct.Builder.html) types for more fine grained control. /// /// # Example /// /// This example shows how to create a searcher from an iterator of patterns. /// By default, leftmost-first match semantics are used. /// /// ``` /// use aho_corasick::packed::{MatchKind, Searcher}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let matches: Vec = searcher /// .find_iter("foobar") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![0], matches); /// # Some(()) } /// # if cfg!(target_arch = "x86_64") { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` #[derive(Clone, Debug)] pub struct Searcher { config: Config, patterns: Patterns, rabinkarp: RabinKarp, search_kind: SearchKind, minimum_len: usize, } #[derive(Clone, Debug)] enum SearchKind { Teddy(Teddy), RabinKarp, } impl Searcher { /// A convenience function for constructing a searcher from an iterator /// of things that can be converted to a `&[u8]`. /// /// If a searcher could not be constructed (either because of an /// unsupported CPU or because there are too many patterns), then `None` /// is returned. /// /// # Example /// /// Basic usage: /// /// ``` /// use aho_corasick::packed::{MatchKind, Searcher}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let matches: Vec = searcher /// .find_iter("foobar") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![0], matches); /// # Some(()) } /// # if cfg!(target_arch = "x86_64") { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` pub fn new(patterns: I) -> Option where I: IntoIterator, P: AsRef<[u8]>, { Builder::new().extend(patterns).build() } /// Return the first occurrence of any of the patterns in this searcher, /// according to its match semantics, in the given haystack. The `Match` /// returned will include the identifier of the pattern that matched, which /// corresponds to the index of the pattern (starting from `0`) in which it /// was added. 
/// /// # Example /// /// Basic usage: /// /// ``` /// use aho_corasick::packed::{MatchKind, Searcher}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let mat = searcher.find("foobar")?; /// assert_eq!(0, mat.pattern()); /// assert_eq!(0, mat.start()); /// assert_eq!(6, mat.end()); /// # Some(()) } /// # if cfg!(target_arch = "x86_64") { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` pub fn find>(&self, haystack: B) -> Option { self.find_at(haystack, 0) } /// Return the first occurrence of any of the patterns in this searcher, /// according to its match semantics, in the given haystack starting from /// the given position. /// /// The `Match` returned will include the identifier of the pattern that /// matched, which corresponds to the index of the pattern (starting from /// `0`) in which it was added. The offsets in the `Match` will be relative /// to the start of `haystack` (and not `at`). /// /// # Example /// /// Basic usage: /// /// ``` /// use aho_corasick::packed::{MatchKind, Searcher}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let mat = searcher.find_at("foofoobar", 3)?; /// assert_eq!(0, mat.pattern()); /// assert_eq!(3, mat.start()); /// assert_eq!(9, mat.end()); /// # Some(()) } /// # if cfg!(target_arch = "x86_64") { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` pub fn find_at>( &self, haystack: B, at: usize, ) -> Option { let haystack = haystack.as_ref(); match self.search_kind { SearchKind::Teddy(ref teddy) => { if haystack[at..].len() < teddy.minimum_len() { return self.slow_at(haystack, at); } teddy.find_at(&self.patterns, haystack, at) } SearchKind::RabinKarp => { self.rabinkarp.find_at(&self.patterns, haystack, at) } } } /// Return an iterator of non-overlapping occurrences of the patterns in /// this searcher, according to its match semantics, in the given haystack. /// /// # Example /// /// Basic usage: /// /// ``` /// use aho_corasick::packed::{MatchKind, Searcher}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// let matches: Vec = searcher /// .find_iter("foobar fooba foofoo") /// .map(|mat| mat.pattern()) /// .collect(); /// assert_eq!(vec![0, 1, 1, 1], matches); /// # Some(()) } /// # if cfg!(target_arch = "x86_64") { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>( &'a self, haystack: &'b B, ) -> FindIter<'a, 'b> { FindIter { searcher: self, haystack: haystack.as_ref(), at: 0 } } /// Returns the match kind used by this packed searcher. /// /// # Examples /// /// Basic usage: /// /// ``` /// use aho_corasick::packed::{MatchKind, Searcher}; /// /// # fn example() -> Option<()> { /// let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; /// // leftmost-first is the default. /// assert_eq!(&MatchKind::LeftmostFirst, searcher.match_kind()); /// # Some(()) } /// # if cfg!(target_arch = "x86_64") { /// # example().unwrap() /// # } else { /// # assert!(example().is_none()); /// # } /// ``` pub fn match_kind(&self) -> &MatchKind { self.patterns.match_kind() } /// Returns the minimum length of a haystack that is required in order for /// packed searching to be effective. /// /// In some cases, the underlying packed searcher may not be able to search /// very short haystacks. 
When that occurs, the implementation will defer /// to a slower non-packed searcher (which is still generally faster than /// Aho-Corasick for a small number of patterns). However, callers may /// want to avoid ever using the slower variant, which one can do by /// never passing a haystack shorter than the minimum length returned by /// this method. pub fn minimum_len(&self) -> usize { self.minimum_len } /// Returns the approximate total amount of heap used by this searcher, in /// units of bytes. pub fn heap_bytes(&self) -> usize { self.patterns.heap_bytes() + self.rabinkarp.heap_bytes() + self.search_kind.heap_bytes() } /// Use a slow (non-packed) searcher. /// /// This is useful when a packed searcher could be constructed, but could /// not be used to search a specific haystack. For example, if Teddy was /// built but the haystack is smaller than ~34 bytes, then Teddy might not /// be able to run. fn slow_at(&self, haystack: &[u8], at: usize) -> Option { self.rabinkarp.find_at(&self.patterns, haystack, at) } } impl SearchKind { fn heap_bytes(&self) -> usize { match *self { SearchKind::Teddy(ref ted) => ted.heap_bytes(), SearchKind::RabinKarp => 0, } } } /// An iterator over non-overlapping matches from a packed searcher. /// /// The lifetime `'s` refers to the lifetime of the underlying /// [`Searcher`](struct.Searcher.html), while the lifetime `'h` refers to the /// lifetime of the haystack being searched. #[derive(Debug)] pub struct FindIter<'s, 'h> { searcher: &'s Searcher, haystack: &'h [u8], at: usize, } impl<'s, 'h> Iterator for FindIter<'s, 'h> { type Item = Match; fn next(&mut self) -> Option { if self.at > self.haystack.len() { return None; } match self.searcher.find_at(&self.haystack, self.at) { None => None, Some(c) => { self.at = c.end; Some(c) } } } } aho-corasick-0.7.8/src/packed/mod.rs010064400017500000144000000103241352131022200154400ustar0000000000000000/*! A lower level API for packed multiple substring search, principally for a small number of patterns. This sub-module provides vectorized routines for quickly finding matches of a small number of patterns. In general, users of this crate shouldn't need to interface with this module directory, as the primary [`AhoCorasick`](../struct.AhoCorasick.html) searcher will use these routines automatically as a prefilter when applicable. However, in some cases, callers may want to bypass the Aho-Corasick machinery entirely and use this vectorized searcher directly. # Overview The primary types in this sub-module are: * [`Searcher`](struct.Searcher.html) executes the actual search algorithm to report matches in a haystack. * [`Builder`](struct.Builder.html) accumulates patterns incrementally and can construct a `Searcher`. * [`Config`](struct.Config.html) permits tuning the searcher, and itself will produce a `Builder` (which can then be used to build a `Searcher`). Currently, the only tuneable knob are the match semantics, but this may be expanded in the future. # Examples This example shows how to create a searcher from an iterator of patterns. By default, leftmost-first match semantics are used. (See the top-level [`MatchKind`](../enum.MatchKind.html) type for more details about match semantics, which apply similarly to packed substring search.) 
``` use aho_corasick::packed::{MatchKind, Searcher}; # fn example() -> Option<()> { let searcher = Searcher::new(["foobar", "foo"].iter().cloned())?; let matches: Vec = searcher .find_iter("foobar") .map(|mat| mat.pattern()) .collect(); assert_eq!(vec![0], matches); # Some(()) } # if cfg!(target_arch = "x86_64") { # example().unwrap() # } else { # assert!(example().is_none()); # } ``` This example shows how to use [`Config`](struct.Config.html) to change the match semantics to leftmost-longest: ``` use aho_corasick::packed::{Config, MatchKind}; # fn example() -> Option<()> { let searcher = Config::new() .match_kind(MatchKind::LeftmostLongest) .builder() .add("foo") .add("foobar") .build()?; let matches: Vec = searcher .find_iter("foobar") .map(|mat| mat.pattern()) .collect(); assert_eq!(vec![1], matches); # Some(()) } # if cfg!(target_arch = "x86_64") { # example().unwrap() # } else { # assert!(example().is_none()); # } ``` # Packed substring searching Packed substring searching refers to the use of SIMD (Single Instruction, Multiple Data) to accelerate the detection of matches in a haystack. Unlike conventional algorithms, such as Aho-Corasick, SIMD algorithms for substring search tend to do better with a small number of patterns, where as Aho-Corasick generally maintains reasonably consistent performance regardless of the number of patterns you give it. Because of this, the vectorized searcher in this sub-module cannot be used as a general purpose searcher, since building the searcher may fail. However, in exchange, when searching for a small number of patterns, searching can be quite a bit faster than Aho-Corasick (sometimes by an order of magnitude). The key take away here is that constructing a searcher from a list of patterns is a fallible operation. While the precise conditions under which building a searcher can fail is specifically an implementation detail, here are some common reasons: * Too many patterns were given. Typically, the limit is on the order of 100 or so, but this limit may fluctuate based on available CPU features. * The available packed algorithms require CPU features that aren't available. For example, currently, this crate only provides packed algorithms for `x86_64`. Therefore, constructing a packed searcher on any other target (e.g., ARM) will always fail. * Zero patterns were given, or one of the patterns given was empty. Packed searchers require at least one pattern and that all patterns are non-empty. * Something else about the nature of the patterns (typically based on heuristics) suggests that a packed searcher would perform very poorly, so no searcher is built. */ pub use packed::api::{Builder, Config, FindIter, MatchKind, Searcher}; mod api; mod pattern; mod rabinkarp; mod teddy; #[cfg(test)] mod tests; #[cfg(target_arch = "x86_64")] mod vector; aho-corasick-0.7.8/src/packed/pattern.rs010064400017500000144000000261311352132431400163500ustar0000000000000000use std::cmp; use std::fmt; use std::mem; use std::u16; use std::usize; use packed::api::MatchKind; /// The type used for representing a pattern identifier. /// /// We don't use `usize` here because our packed searchers don't scale to /// huge numbers of patterns, so we keep things a bit smaller. pub type PatternID = u16; /// A non-empty collection of non-empty patterns to search for. /// /// This collection of patterns is what is passed around to both execute /// searches and to construct the searchers themselves. 
Namely, this permits /// searches to avoid copying all of the patterns, and allows us to keep only /// one copy throughout all packed searchers. /// /// Note that this collection is not a set. The same pattern can appear more /// than once. #[derive(Clone, Debug)] pub struct Patterns { /// The match semantics supported by this collection of patterns. /// /// The match semantics determines the order of the iterator over patterns. /// For leftmost-first, patterns are provided in the same order as were /// provided by the caller. For leftmost-longest, patterns are provided in /// descending order of length, with ties broken by the order in which they /// were provided by the caller. kind: MatchKind, /// The collection of patterns, indexed by their identifier. by_id: Vec>, /// The order of patterns defined for iteration, given by pattern /// identifiers. The order of `by_id` and `order` is always the same for /// leftmost-first semantics, but may be different for leftmost-longest /// semantics. order: Vec, /// The length of the smallest pattern, in bytes. minimum_len: usize, /// The largest pattern identifier. This should always be equivalent to /// the number of patterns minus one in this collection. max_pattern_id: PatternID, /// The total number of pattern bytes across the entire collection. This /// is used for reporting total heap usage in constant time. total_pattern_bytes: usize, } impl Patterns { /// Create a new collection of patterns for the given match semantics. The /// ID of each pattern is the index of the pattern at which it occurs in /// the `by_id` slice. /// /// If any of the patterns in the slice given are empty, then this panics. /// Similarly, if the number of patterns given is zero, then this also /// panics. pub fn new() -> Patterns { Patterns { kind: MatchKind::default(), by_id: vec![], order: vec![], minimum_len: usize::MAX, max_pattern_id: 0, total_pattern_bytes: 0, } } /// Add a pattern to this collection. /// /// This panics if the pattern given is empty. pub fn add(&mut self, bytes: &[u8]) { assert!(!bytes.is_empty()); assert!(self.by_id.len() <= u16::MAX as usize); let id = self.by_id.len() as u16; self.max_pattern_id = id; self.order.push(id); self.by_id.push(bytes.to_vec()); self.minimum_len = cmp::min(self.minimum_len, bytes.len()); self.total_pattern_bytes += bytes.len(); } /// Set the match kind semantics for this collection of patterns. /// /// If the kind is not set, then the default is leftmost-first. pub fn set_match_kind(&mut self, kind: MatchKind) { match kind { MatchKind::LeftmostFirst => { self.order.sort(); } MatchKind::LeftmostLongest => { let (order, by_id) = (&mut self.order, &mut self.by_id); order.sort_by(|&id1, &id2| { by_id[id1 as usize] .len() .cmp(&by_id[id2 as usize].len()) .reverse() }); } MatchKind::__Nonexhaustive => unreachable!(), } } /// Return the number of patterns in this collection. /// /// This is guaranteed to be greater than zero. pub fn len(&self) -> usize { self.by_id.len() } /// Returns true if and only if this collection of patterns is empty. pub fn is_empty(&self) -> bool { self.len() == 0 } /// Returns the approximate total amount of heap used by these patterns, in /// units of bytes. pub fn heap_bytes(&self) -> usize { self.order.len() * mem::size_of::() + self.by_id.len() * mem::size_of::>() + self.total_pattern_bytes } /// Clears all heap memory associated with this collection of patterns and /// resets all state such that it is a valid empty collection. 
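    ///
    /// A minimal sketch of the add/reset cycle (this is an internal,
    /// crate-private API):
    ///
    /// ```ignore
    /// let mut pats = Patterns::new();
    /// pats.add(b"foo");
    /// pats.add(b"bar");
    /// assert_eq!(2, pats.len());
    /// pats.reset();
    /// assert!(pats.is_empty());
    /// ```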
pub fn reset(&mut self) { self.kind = MatchKind::default(); self.by_id.clear(); self.order.clear(); self.minimum_len = usize::MAX; self.max_pattern_id = 0; } /// Return the maximum pattern identifier in this collection. This can be /// useful in searchers for ensuring that the collection of patterns they /// are provided at search time and at build time have the same size. pub fn max_pattern_id(&self) -> PatternID { assert_eq!((self.max_pattern_id + 1) as usize, self.len()); self.max_pattern_id } /// Returns the length, in bytes, of the smallest pattern. /// /// This is guaranteed to be at least one. pub fn minimum_len(&self) -> usize { self.minimum_len } /// Returns the match semantics used by these patterns. pub fn match_kind(&self) -> &MatchKind { &self.kind } /// Return the pattern with the given identifier. If such a pattern does /// not exist, then this panics. pub fn get(&self, id: PatternID) -> Pattern { Pattern(&self.by_id[id as usize]) } /// Return the pattern with the given identifier without performing bounds /// checks. /// /// # Safety /// /// Callers must ensure that a pattern with the given identifier exists /// before using this method. #[cfg(target_arch = "x86_64")] pub unsafe fn get_unchecked(&self, id: PatternID) -> Pattern { Pattern(self.by_id.get_unchecked(id as usize)) } /// Return an iterator over all the patterns in this collection, in the /// order in which they should be matched. /// /// Specifically, in a naive multi-pattern matcher, the following is /// guaranteed to satisfy the match semantics of this collection of /// patterns: /// /// ```ignore /// for i in 0..haystack.len(): /// for p in patterns.iter(): /// if haystack[i..].starts_with(p.bytes()): /// return Match(p.id(), i, i + p.bytes().len()) /// ``` /// /// Namely, among the patterns in a collection, if they are matched in /// the order provided by this iterator, then the result is guaranteed /// to satisfy the correct match semantics. (Either leftmost-first or /// leftmost-longest.) pub fn iter(&self) -> PatternIter { PatternIter { patterns: self, i: 0 } } } /// An iterator over the patterns in the `Patterns` collection. /// /// The order of the patterns provided by this iterator is consistent with the /// match semantics of the originating collection of patterns. /// /// The lifetime `'p` corresponds to the lifetime of the collection of patterns /// this is iterating over. #[derive(Debug)] pub struct PatternIter<'p> { patterns: &'p Patterns, i: usize, } impl<'p> Iterator for PatternIter<'p> { type Item = (PatternID, Pattern<'p>); fn next(&mut self) -> Option<(PatternID, Pattern<'p>)> { if self.i >= self.patterns.len() { return None; } let id = self.patterns.order[self.i]; let p = self.patterns.get(id); self.i += 1; Some((id, p)) } } /// A pattern that is used in packed searching. #[derive(Clone)] pub struct Pattern<'a>(&'a [u8]); impl<'a> fmt::Debug for Pattern<'a> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Pattern") .field("lit", &String::from_utf8_lossy(&self.0)) .finish() } } impl<'p> Pattern<'p> { /// Returns the length of this pattern, in bytes. pub fn len(&self) -> usize { self.0.len() } /// Returns the bytes of this pattern. pub fn bytes(&self) -> &[u8] { &self.0 } /// Returns the first `len` low nybbles from this pattern. If this pattern /// is shorter than `len`, then this panics. 
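    ///
    /// For example, if this pattern wraps the bytes of `foo` (`0x66 0x6F
    /// 0x6F`), then `low_nybbles(2)` returns `[0x6, 0xF]`.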
#[cfg(target_arch = "x86_64")] pub fn low_nybbles(&self, len: usize) -> Vec { let mut nybs = vec![]; for &b in self.bytes().iter().take(len) { nybs.push(b & 0xF); } nybs } /// Returns true if this pattern is a prefix of the given bytes. #[inline(always)] pub fn is_prefix(&self, bytes: &[u8]) -> bool { self.len() <= bytes.len() && self.equals(&bytes[..self.len()]) } /// Returns true if and only if this pattern equals the given bytes. #[inline(always)] pub fn equals(&self, bytes: &[u8]) -> bool { // Why not just use memcmp for this? Well, memcmp requires calling out // to libc, and this routine is called in fairly hot code paths. Other // than just calling out to libc, it also seems to result in worse // codegen. By rolling our own memcpy in pure Rust, it seems to appear // more friendly to the optimizer. // // This results in an improvement in just about every benchmark. Some // smaller than others, but in some cases, up to 30% faster. if self.len() != bytes.len() { return false; } if self.len() < 8 { for (&b1, &b2) in self.bytes().iter().zip(bytes) { if b1 != b2 { return false; } } return true; } // When we have 8 or more bytes to compare, then proceed in chunks of // 8 at a time using unaligned loads. let mut p1 = self.bytes().as_ptr(); let mut p2 = bytes.as_ptr(); let p1end = self.bytes()[self.len() - 8..].as_ptr(); let p2end = bytes[bytes.len() - 8..].as_ptr(); // SAFETY: Via the conditional above, we know that both `p1` and `p2` // have the same length, so `p1 < p1end` implies that `p2 < p2end`. // Thus, derefencing both `p1` and `p2` in the loop below is safe. // // Moreover, we set `p1end` and `p2end` to be 8 bytes before the actual // end of of `p1` and `p2`. Thus, the final dereference outside of the // loop is guaranteed to be valid. // // Finally, we needn't worry about 64-bit alignment here, since we // do unaligned loads. unsafe { while p1 < p1end { let v1 = (p1 as *const u64).read_unaligned(); let v2 = (p2 as *const u64).read_unaligned(); if v1 != v2 { return false; } p1 = p1.add(8); p2 = p2.add(8); } let v1 = (p1end as *const u64).read_unaligned(); let v2 = (p2end as *const u64).read_unaligned(); v1 == v2 } } } aho-corasick-0.7.8/src/packed/rabinkarp.rs010064400017500000144000000160131352132431400166420ustar0000000000000000use std::mem; use packed::pattern::{PatternID, Patterns}; use Match; /// The type of the rolling hash used in the Rabin-Karp algorithm. type Hash = usize; /// The number of buckets to store our patterns in. We don't want this to be /// too big in order to avoid wasting memory, but we don't want it to be too /// small either to avoid spending too much time confirming literals. /// /// The number of buckets MUST be a power of two. Otherwise, determining the /// bucket from a hash will slow down the code considerably. Using a power /// of two means `hash % NUM_BUCKETS` can compile down to a simple `and` /// instruction. const NUM_BUCKETS: usize = 64; /// An implementation of the Rabin-Karp algorithm. The main idea of this /// algorithm is to maintain a rolling hash as it moves through the input, and /// then check whether that hash corresponds to the same hash for any of the /// patterns we're looking for. /// /// A draw back of naively scaling Rabin-Karp to multiple patterns is that /// it requires all of the patterns to be the same length, which in turn /// corresponds to the number of bytes to hash. We adapt this to work for /// multiple patterns of varying size by fixing the number of bytes to hash /// to be the length of the smallest pattern. 
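/// For example, given the patterns `foobar` and `foo`, the number of bytes
/// hashed is 3 (the length of `foo`). Roughly, the hashing below has the
/// following shape (wrapping arithmetic elided):
///
/// ```ignore
/// // hash a full window, one byte at a time:
/// hash = (hash << 1) + byte
/// // slide the window forward by one byte:
/// hash = ((hash - old_byte * 2^(window_len - 1)) << 1) + new_byte
/// ```
///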
We also split the patterns into /// several buckets to hopefully make the confirmation step faster. /// /// Wikipedia has a decent explanation, if a bit heavy on the theory: /// https://en.wikipedia.org/wiki/Rabin%E2%80%93Karp_algorithm /// /// But ESMAJ provides something a bit more concrete: /// http://www-igm.univ-mlv.fr/~lecroq/string/node5.html #[derive(Clone, Debug)] pub struct RabinKarp { /// The order of patterns in each bucket is significant. Namely, they are /// arranged such that the first one to match is the correct match. This /// may not necessarily correspond to the order provided by the caller. /// For example, if leftmost-longest semantics are used, then the patterns /// are sorted by their length in descending order. If leftmost-first /// semantics are used, then the patterns are sorted by their pattern ID /// in ascending order (which corresponds to the caller's order). buckets: Vec>, /// The length of the hashing window. Generally, this corresponds to the /// length of the smallest pattern. hash_len: usize, /// The factor to subtract out of a hash before updating it with a new /// byte. hash_2pow: usize, /// The maximum identifier of a pattern. This is used as a sanity check /// to ensure that the patterns provided by the caller are the same as /// the patterns that were used to compile the matcher. This sanity check /// possibly permits safely eliminating bounds checks regardless of what /// patterns are provided by the caller. /// /// (Currently, we don't use this to elide bounds checks since it doesn't /// result in a measurable performance improvement, but we do use it for /// better failure modes.) max_pattern_id: PatternID, } impl RabinKarp { /// Compile a new Rabin-Karp matcher from the patterns given. /// /// This panics if any of the patterns in the collection are empty, or if /// the collection is itself empty. pub fn new(patterns: &Patterns) -> RabinKarp { assert!(patterns.len() >= 1); let hash_len = patterns.minimum_len(); assert!(hash_len >= 1); let mut hash_2pow = 1usize; for _ in 1..hash_len { hash_2pow = hash_2pow.wrapping_shl(1); } let mut rk = RabinKarp { buckets: vec![vec![]; NUM_BUCKETS], hash_len, hash_2pow, max_pattern_id: patterns.max_pattern_id(), }; for (id, pat) in patterns.iter() { let hash = rk.hash(&pat.bytes()[..rk.hash_len]); let bucket = hash % NUM_BUCKETS; rk.buckets[bucket].push((hash, id)); } rk } /// Return the first matching pattern in the given haystack, begining the /// search at `at`. pub fn find_at( &self, patterns: &Patterns, haystack: &[u8], mut at: usize, ) -> Option { assert_eq!(NUM_BUCKETS, self.buckets.len()); assert_eq!( self.max_pattern_id, patterns.max_pattern_id(), "Rabin-Karp must be called with same patterns it was built with", ); if at + self.hash_len > haystack.len() { return None; } let mut hash = self.hash(&haystack[at..at + self.hash_len]); loop { let bucket = &self.buckets[hash % NUM_BUCKETS]; for &(phash, pid) in bucket { if phash == hash { if let Some(c) = self.verify(patterns, pid, haystack, at) { return Some(c); } } } if at + self.hash_len >= haystack.len() { return None; } hash = self.update_hash( hash, haystack[at], haystack[at + self.hash_len], ); at += 1; } } /// Returns the approximate total amount of heap used by this searcher, in /// units of bytes. 
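    ///
    /// (This counts the bucket `Vec`s plus one `(Hash, PatternID)` entry per
    /// pattern; the pattern bytes themselves are accounted for separately by
    /// the owning searcher.)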
pub fn heap_bytes(&self) -> usize { let num_patterns = self.max_pattern_id as usize + 1; self.buckets.len() * mem::size_of::>() + num_patterns * mem::size_of::<(Hash, PatternID)>() } /// Verify whether the pattern with the given id matches at /// `haystack[at..]`. /// /// We tag this function as `cold` because it helps improve codegen. /// Intuitively, it would seem like inlining it would be better. However, /// the only time this is called and a match is not found is when there /// there is a hash collision, or when a prefix of a pattern matches but /// the entire pattern doesn't match. This is hopefully fairly rare, and /// if it does occur a lot, it's going to be slow no matter what we do. #[cold] fn verify( &self, patterns: &Patterns, id: PatternID, haystack: &[u8], at: usize, ) -> Option { let pat = patterns.get(id); if pat.is_prefix(&haystack[at..]) { Some(Match::from_span(id as usize, at, at + pat.len())) } else { None } } /// Hash the given bytes. fn hash(&self, bytes: &[u8]) -> Hash { assert_eq!(self.hash_len, bytes.len()); let mut hash = 0usize; for &b in bytes { hash = hash.wrapping_shl(1).wrapping_add(b as usize); } hash } /// Update the hash given based on removing `old_byte` at the beginning /// of some byte string, and appending `new_byte` to the end of that same /// byte string. fn update_hash(&self, prev: Hash, old_byte: u8, new_byte: u8) -> Hash { prev.wrapping_sub((old_byte as usize).wrapping_mul(self.hash_2pow)) .wrapping_shl(1) .wrapping_add(new_byte as usize) } } aho-corasick-0.7.8/src/packed/teddy/README.md010064400017500000144000000453601352131022200167130ustar0000000000000000Teddy is a simd accelerated multiple substring matching algorithm. The name and the core ideas in the algorithm were learned from the [Hyperscan][1_u] project. The implementation in this repository was mostly motivated for use in accelerating regex searches by searching for small sets of required literals extracted from the regex. # Background The key idea of Teddy is to do *packed* substring matching. In the literature, packed substring matching is the idea of examining multiple bytes in a haystack at a time to detect matches. Implementations of, for example, memchr (which detects matches of a single byte) have been doing this for years. Only recently, with the introduction of various SIMD instructions, has this been extended to substring matching. The PCMPESTRI instruction (and its relatives), for example, implements substring matching in hardware. It is, however, limited to substrings of length 16 bytes or fewer, but this restriction is fine in a regex engine, since we rarely care about the performance difference between searching for a 16 byte literal and a 16 + N literal; 16 is already long enough. The key downside of the PCMPESTRI instruction, on current (2016) CPUs at least, is its latency and throughput. As a result, it is often faster to do substring search with a Boyer-Moore (or Two-Way) variant and a well placed memchr to quickly skip through the haystack. There are fewer results from the literature on packed substring matching, and even fewer for packed multiple substring matching. Ben-Kiki et al. [2] describes use of PCMPESTRI for substring matching, but is mostly theoretical and hand-waves performance. There is other theoretical work done by Bille [3] as well. The rest of the work in the field, as far as I'm aware, is by Faro and Kulekci and is generally focused on multiple pattern search. 
Their first paper [4a] introduces the concept of a fingerprint, which is computed for every block of N bytes in every pattern. The haystack is then scanned N bytes at a time and a fingerprint is computed in the same way it was computed for blocks in the patterns. If the fingerprint corresponds to one that was found in a pattern, then a verification step follows to confirm that one of the substrings with the corresponding fingerprint actually matches at the current location. Various implementation tricks are employed to make sure the fingerprint lookup is fast; typically by truncating the fingerprint. (This may, of course, provoke more steps in the verification process, so a balance must be struck.) The main downside of [4a] is that the minimum substring length is 32 bytes, presumably because of how the algorithm uses certain SIMD instructions. This essentially makes it useless for general purpose regex matching, where a small number of short patterns is far more likely. Faro and Kulekci published another paper [4b] that is conceptually very similar to [4a]. The key difference is that it uses the CRC32 instruction (introduced as part of SSE 4.2) to compute fingerprint values. This also enables the algorithm to work effectively on substrings as short as 7 bytes with 4 byte windows. 7 bytes is unfortunately still too long. The window could be technically shrunk to 2 bytes, thereby reducing minimum length to 3, but the small window size ends up negating most performance benefits—and it's likely the common case in a general purpose regex engine. Faro and Kulekci also published [4c] that appears to be intended as a replacement to using PCMPESTRI. In particular, it is specifically motivated by the high throughput/latency time of PCMPESTRI and therefore chooses other SIMD instructions that are faster. While this approach works for short substrings, I personally couldn't see a way to generalize it to multiple substring search. Faro and Kulekci have another paper [4d] that I haven't been able to read because it is behind a paywall. # Teddy Finally, we get to Teddy. If the above literature review is complete, then it appears that Teddy is a novel algorithm. More than that, in my experience, it completely blows away the competition for short substrings, which is exactly what we want in a general purpose regex engine. Again, the algorithm appears to be developed by the authors of [Hyperscan][1_u]. Hyperscan was open sourced late 2015, and no earlier history could be found. Therefore, tracking the exact provenance of the algorithm with respect to the published literature seems difficult. At a high level, Teddy works somewhat similarly to the fingerprint algorithms published by Faro and Kulekci, but Teddy does it in a way that scales a bit better. Namely: 1. Teddy's core algorithm scans the haystack in 16 (for SSE, or 32 for AVX) byte chunks. 16 (or 32) is significant because it corresponds to the number of bytes in a SIMD vector. 2. Bitwise operations are performed on each chunk to discover if any region of it matches a set of precomputed fingerprints from the patterns. If there are matches, then a verification step is performed. In this implementation, our verification step is naive. This can be improved upon. The details to make this work are quite clever. First, we must choose how to pick our fingerprints. In Hyperscan's implementation, I *believe* they use the last N bytes of each substring, where N must be at least the minimum length of any substring in the set being searched. 
In this implementation, we use the first N bytes of each substring. (The tradeoffs between these choices aren't yet clear to me.) We then must figure out how to quickly test whether an occurrence of any fingerprint from the set of patterns appears in a 16 byte block from the haystack. To keep things simple, let's assume N = 1 and examine some examples to motivate the approach. Here are our patterns: ```ignore foo bar baz ``` The corresponding fingerprints, for N = 1, are `f`, `b` and `b`. Now let's set our 16 byte block to: ```ignore bat cat foo bump xxxxxxxxxxxxxxxx ``` To cut to the chase, Teddy works by using bitsets. In particular, Teddy creates a mask that allows us to quickly compute membership of a fingerprint in a 16 byte block that also tells which pattern the fingerprint corresponds to. In this case, our fingerprint is a single byte, so an appropriate abstraction is a map from a single byte to a list of patterns that contain that fingerprint: ```ignore f |--> foo b |--> bar, baz ``` Now, all we need to do is figure out how to represent this map in vector space and use normal SIMD operations to perform a lookup. The first simplification we can make is to represent our patterns as bit fields occupying a single byte. This is important, because a single SIMD vector can store 16 bytes. ```ignore f |--> 00000001 b |--> 00000010, 00000100 ``` How do we perform lookup though? It turns out that SSSE3 introduced a very cool instruction called PSHUFB. The instruction takes two SIMD vectors, `A` and `B`, and returns a third vector `C`. All vectors are treated as 16 8-bit integers. `C` is formed by `C[i] = A[B[i]]`. (This is a bit of a simplification, but true for the purposes of this algorithm. For full details, see [Intel's Intrinsics Guide][5_u].) This essentially lets us use the values in `B` to lookup values in `A`. If we could somehow cause `B` to contain our 16 byte block from the haystack, and if `A` could contain our bitmasks, then we'd end up with something like this for `A`: ```ignore 0x00 0x01 ... 0x62 ... 0x66 ... 0xFF A = 0 0 00000110 00000001 0 ``` And if `B` contains our window from our haystack, we could use shuffle to take the values from `B` and use them to look up our bitsets in `A`. But of course, we can't do this because `A` in the above example contains 256 bytes, which is much larger than the size of a SIMD vector. Nybbles to the rescue! A nybble is 4 bits. Instead of one mask to hold all of our bitsets, we can use two masks, where one mask corresponds to the lower four bits of our fingerprint and the other mask corresponds to the upper four bits. So our map now looks like: ```ignore 'f' & 0xF = 0x6 |--> 00000001 'f' >> 4 = 0x6 |--> 00000111 'b' & 0xF = 0x2 |--> 00000110 'b' >> 4 = 0x6 |--> 00000111 ``` Notice that the bitsets for each nybble correspond to the union of all fingerprints that contain that nybble. For example, both `f` and `b` have the same upper 4 bits but differ on the lower 4 bits. Putting this together, we have `A0`, `A1` and `B`, where `A0` is our mask for the lower nybble, `A1` is our mask for the upper nybble and `B` is our 16 byte block from the haystack: ```ignore 0x00 0x01 0x02 0x03 ... 0x06 ... 0xF A0 = 0 0 00000110 0 00000001 0 A1 = 0 0 0 0 00000111 0 B = b a t _ t p B = 0x62 0x61 0x74 0x20 0x74 0x70 ``` But of course, we can't use `B` with `PSHUFB` yet, since its values are 8 bits, and we need indexes that are at most 4 bits (corresponding to one of 16 values). 
We can apply the same transformation to split `B` into lower and upper nybbles as we did `A`. As before, `B0` corresponds to the lower nybbles and `B1` corresponds to the upper nybbles: ```ignore b a t _ c a t _ f o o _ b u m p B0 = 0x2 0x1 0x4 0x0 0x3 0x1 0x4 0x0 0x6 0xF 0xF 0x0 0x2 0x5 0xD 0x0 B1 = 0x6 0x6 0x7 0x2 0x6 0x6 0x7 0x2 0x6 0x6 0x6 0x2 0x6 0x7 0x6 0x7 ``` And now we have a nice correspondence. `B0` can index `A0` and `B1` can index `A1`. Here's what we get when we apply `C0 = PSHUFB(A0, B0)`: ```ignore b a ... f o ... p A0[0x2] A0[0x1] A0[0x6] A0[0xF] A0[0x0] C0 = 00000110 0 00000001 0 0 ``` And `C1 = PSHUFB(A1, B1)`: ```ignore b a ... f o ... p A1[0x6] A1[0x6] A1[0x6] A1[0x6] A1[0x7] C1 = 00000111 00000111 00000111 00000111 0 ``` Notice how neither one of `C0` or `C1` is guaranteed to report fully correct results all on its own. For example, `C1` claims that `b` is a fingerprint for the pattern `foo` (since `A1[0x6] = 00000111`), and that `o` is a fingerprint for all of our patterns. But if we combined `C0` and `C1` with an `AND` operation: ```ignore b a ... f o ... p C = 00000110 0 00000001 0 0 ``` Then we now have that `C[i]` contains a bitset corresponding to the matching fingerprints in a haystack's 16 byte block, where `i` is the `ith` byte in that block. Once we have that, we can look for the position of the least significant bit in `C`. (Least significant because we only target `x86_64` here, which is always little endian. Thus, the least significant bytes correspond to bytes in our haystack at a lower address.) That position, modulo `8`, gives us the pattern that the fingerprint matches. That position, integer divided by `8`, also gives us the byte offset that the fingerprint occurs in inside the 16 byte haystack block. Using those two pieces of information, we can run a verification procedure that tries to match all substrings containing that fingerprint at that position in the haystack. # Implementation notes The problem with the algorithm as described above is that it uses a single byte for a fingerprint. This will work well if the fingerprints are rare in the haystack (e.g., capital letters or special characters in normal English text), but if the fingerprints are common, you'll wind up spending too much time in the verification step, which effectively negates the performance benefits of scanning 16 bytes at a time. Remember, the key to the performance of this algorithm is to do as little work as possible per 16 (or 32) bytes. This algorithm can be extrapolated in a relatively straight-forward way to use larger fingerprints. That is, instead of a single byte prefix, we might use a two or three byte prefix. The implementation here implements N = {1, 2, 3} and always picks the largest N possible. The rationale is that the bigger the fingerprint, the fewer verification steps we'll do. Of course, if N is too large, then we'll end up doing too much on each step. The way to extend it is: 1. Add a mask for each byte in the fingerprint. (Remember that each mask is composed of two SIMD vectors.) This results in a value of `C` for each byte in the fingerprint while searching. 2. When testing each 16 (or 32) byte block, each value of `C` must be shifted so that they are aligned. Once aligned, they should all be `AND`'d together. This will give you only the bitsets corresponding to the full match of the fingerprint. 
To do this, one needs to save the last byte (for N=2) or last two bytes (for N=3) from the previous iteration, and then line them up with the first one or two bytes of the next iteration. ## Verification Verification generally follows the procedure outlined above. The tricky parts are in the right formulation of operations to get our bits out of our vectors. We have a limited set of operations available to us on SIMD vectors as 128-bit or 256-bit numbers, so we wind up needing to rip out 2 (or 4) 64-bit integers from our vectors, and then run our verification step on each of those. The verification step looks at the least significant bit set, and from its position, we can derive the byte offset and bucket. (Again, as described above.) Once we know the bucket, we do a fairly naive exhaustive search for every literal in that bucket. (Hyperscan is a bit smarter here and uses a hash table, but I haven't had time to thoroughly explore that. A few initial half-hearted attempts resulted in worse performance.) ## AVX The AVX version of Teddy extrapolates almost perfectly from the SSE version. The only hickup is that PALIGNR is used to align chunks in the 16-bit version, and there is no equivalent instruction in AVX. AVX does have VPALIGNR, but it only works within 128-bit lanes. So there's a bit of tomfoolery to get around this by shuffling the vectors before calling VPALIGNR. The only other aspect to AVX is that since our masks are still fundamentally 16-bytes (0x0-0xF), they are duplicated to 32-bytes, so that they can apply to 32-byte chunks. ## Fat Teddy In the version of Teddy described above, 8 buckets are used to group patterns that we want to search for. However, when AVX is available, we can extend the number of buckets to 16 by permitting each byte in our masks to use 16-bits instead of 8-bits to represent the buckets it belongs to. (This variant is also in Hyperscan.) However, what we give up is the ability to scan 32 bytes at a time, even though we're using AVX. Instead, we have to scan 16 bytes at a time. What we gain, though, is (hopefully) less work in our verification routine. It patterns are more spread out across more buckets, then there should overall be fewer false positives. In general, Fat Teddy permits us to grow our capacity a bit and search for more literals before Teddy gets overwhelmed. The tricky part of Fat Teddy is in how we adjust our masks and our verification procedure. For the masks, we simply represent the first 8 buckets in each of the low 16 bytes, and then the second 8 buckets in each of the high 16 bytes. Then, in the search loop, instead of loading 32 bytes from the haystack, we load the same 16 bytes from the haystack into both the low and high 16 byte portions of our 256-bit vector. So for example, a mask might look like this: bits: 00100001 00000000 ... 11000000 00000000 00000001 ... 00000000 byte: 31 30 16 15 14 0 offset: 15 14 0 15 14 0 buckets: 8-15 8-15 8-15 0-7 0-7 0-7 Where `byte` is the position in the vector (higher numbers corresponding to more significant bits), `offset` is the corresponding position in the haystack chunk, and `buckets` corresponds to the bucket assignments for that particular byte. In particular, notice that the bucket assignments for offset `0` are spread out between bytes `0` and `16`. This works well for the chunk-by-chunk search procedure, but verification really wants to process all bucket assignments for each offset at once. 
Otherwise, we might wind up finding a match at offset `1` in one the first 8 buckets, when we really should have reported a match at offset `0` in one of the second 8 buckets. (Because we want the leftmost match.) Thus, for verification, we rearrange the above vector such that it is a sequence of 16-bit integers, where the least significant 16-bit integer corresponds to all of the bucket assignments for offset `0`. So with the above vector, the least significant 16-bit integer would be 11000000 000000 which was taken from bytes `16` and `0`. Then the verification step pretty much runs as described, except with 16 buckets instead of 8. # References - **[1]** [Hyperscan on GitHub](https://github.com/01org/hyperscan), [webpage](https://01.org/hyperscan) - **[2a]** Ben-Kiki, O., Bille, P., Breslauer, D., Gasieniec, L., Grossi, R., & Weimann, O. (2011). _Optimal packed string matching_. In LIPIcs-Leibniz International Proceedings in Informatics (Vol. 13). Schloss Dagstuhl-Leibniz-Zentrum fuer Informatik. DOI: 10.4230/LIPIcs.FSTTCS.2011.423. [PDF](http://drops.dagstuhl.de/opus/volltexte/2011/3355/pdf/37.pdf). - **[2b]** Ben-Kiki, O., Bille, P., Breslauer, D., Ga̧sieniec, L., Grossi, R., & Weimann, O. (2014). _Towards optimal packed string matching_. Theoretical Computer Science, 525, 111-129. DOI: 10.1016/j.tcs.2013.06.013. [PDF](http://www.cs.haifa.ac.il/~oren/Publications/bpsm.pdf). - **[3]** Bille, P. (2011). _Fast searching in packed strings_. Journal of Discrete Algorithms, 9(1), 49-56. DOI: 10.1016/j.jda.2010.09.003. [PDF](http://www.sciencedirect.com/science/article/pii/S1570866710000353). - **[4a]** Faro, S., & Külekci, M. O. (2012, October). _Fast multiple string matching using streaming SIMD extensions technology_. In String Processing and Information Retrieval (pp. 217-228). Springer Berlin Heidelberg. DOI: 10.1007/978-3-642-34109-0_23. [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro32.pdf). - **[4b]** Faro, S., & Külekci, M. O. (2013, September). _Towards a Very Fast Multiple String Matching Algorithm for Short Patterns_. In Stringology (pp. 78-91). [PDF](http://www.dmi.unict.it/~faro/papers/conference/faro36.pdf). - **[4c]** Faro, S., & Külekci, M. O. (2013, January). _Fast packed string matching for short patterns_. In Proceedings of the Meeting on Algorithm Engineering & Expermiments (pp. 113-121). Society for Industrial and Applied Mathematics. [PDF](http://arxiv.org/pdf/1209.6449.pdf). - **[4d]** Faro, S., & Külekci, M. O. (2014). _Fast and flexible packed string matching_. Journal of Discrete Algorithms, 28, 61-72. DOI: 10.1016/j.jda.2014.07.003. [1_u]: https://github.com/01org/hyperscan [5_u]: https://software.intel.com/sites/landingpage/IntrinsicsGuide aho-corasick-0.7.8/src/packed/teddy/compile.rs010064400017500000144000000407011361064532000174350ustar0000000000000000// See the README in this directory for an explanation of the Teddy algorithm. use std::cmp; use std::collections::BTreeMap; use std::fmt; use packed::pattern::{PatternID, Patterns}; use packed::teddy::Teddy; /// A builder for constructing a Teddy matcher. /// /// The builder primarily permits fine grained configuration of the Teddy /// matcher. Most options are made only available for testing/benchmarking /// purposes. In reality, options are automatically determined by the nature /// and number of patterns given to the builder. #[derive(Clone, Debug)] pub struct Builder { /// When none, this is automatically determined. 
Otherwise, `false` means /// slim Teddy is used (8 buckets) and `true` means fat Teddy is used /// (16 buckets). Fat Teddy requires AVX2, so if that CPU feature isn't /// available and Fat Teddy was requested, no matcher will be built. fat: Option, /// When none, this is automatically determined. Otherwise, `false` means /// that 128-bit vectors will be used (up to SSSE3 instructions) where as /// `true` means that 256-bit vectors will be used. As with `fat`, if /// 256-bit vectors are requested and they aren't available, then a /// searcher will not be built. avx: Option, } impl Default for Builder { fn default() -> Builder { Builder::new() } } impl Builder { /// Create a new builder for configuring a Teddy matcher. pub fn new() -> Builder { Builder { fat: None, avx: None } } /// Build a matcher for the set of patterns given. If a matcher could not /// be built, then `None` is returned. /// /// Generally, a matcher isn't built if the necessary CPU features aren't /// available, an unsupported target or if the searcher is believed to be /// slower than standard techniques (i.e., if there are too many literals). pub fn build(&self, patterns: &Patterns) -> Option { self.build_imp(patterns) } /// Require the use of Fat (true) or Slim (false) Teddy. Fat Teddy uses /// 16 buckets where as Slim Teddy uses 8 buckets. More buckets are useful /// for a larger set of literals. /// /// `None` is the default, which results in an automatic selection based /// on the number of literals and available CPU features. pub fn fat(&mut self, yes: Option) -> &mut Builder { self.fat = yes; self } /// Request the use of 256-bit vectors (true) or 128-bit vectors (false). /// Generally, a larger vector size is better since it either permits /// matching more patterns or matching more bytes in the haystack at once. /// /// `None` is the default, which results in an automatic selection based on /// the number of literals and available CPU features. pub fn avx(&mut self, yes: Option) -> &mut Builder { self.avx = yes; self } fn build_imp(&self, patterns: &Patterns) -> Option { use packed::teddy::runtime; // Most of the logic here is just about selecting the optimal settings, // or perhaps even rejecting construction altogether. The choices // we have are: fat (avx only) or not, ssse3 or avx2, and how many // patterns we allow ourselves to search. Additionally, for testing // and benchmarking, we permit callers to try to "force" a setting, // and if the setting isn't allowed (e.g., forcing AVX when AVX isn't // available), then we bail and return nothing. if patterns.len() > 64 { return None; } let has_ssse3 = is_x86_feature_detected!("ssse3"); let has_avx = is_x86_feature_detected!("avx2"); let avx = if self.avx == Some(true) { if !has_avx { return None; } true } else if self.avx == Some(false) { if !has_ssse3 { return None; } false } else if !has_ssse3 && !has_avx { return None; } else { has_avx }; let fat = match self.fat { None => avx && patterns.len() > 32, Some(false) => false, Some(true) if !avx => return None, Some(true) => true, }; let mut compiler = Compiler::new(patterns, fat); compiler.compile(); let Compiler { buckets, masks, .. } = compiler; // SAFETY: It is required that the builder only produce Teddy matchers // that are allowed to run on the current CPU, since we later assume // that the presence of (for example) TeddySlim1Mask256 means it is // safe to call functions marked with the `avx2` target feature. 
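        // The tuple below is (number of masks, use 256-bit vectors, fat
        // Teddy); the feature checks above guarantee that every variant
        // constructed here is allowed to run on the current CPU.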
match (masks.len(), avx, fat) { (1, false, _) => Some(Teddy { buckets: buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim1Mask128( runtime::TeddySlim1Mask128 { mask1: runtime::Mask128::new(masks[0]), }, ), }), (1, true, false) => Some(Teddy { buckets: buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim1Mask256( runtime::TeddySlim1Mask256 { mask1: runtime::Mask256::new(masks[0]), }, ), }), (1, true, true) => Some(Teddy { buckets: buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddyFat1Mask256( runtime::TeddyFat1Mask256 { mask1: runtime::Mask256::new(masks[0]), }, ), }), (2, false, _) => Some(Teddy { buckets: buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim2Mask128( runtime::TeddySlim2Mask128 { mask1: runtime::Mask128::new(masks[0]), mask2: runtime::Mask128::new(masks[1]), }, ), }), (2, true, false) => Some(Teddy { buckets: buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim2Mask256( runtime::TeddySlim2Mask256 { mask1: runtime::Mask256::new(masks[0]), mask2: runtime::Mask256::new(masks[1]), }, ), }), (2, true, true) => Some(Teddy { buckets: buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddyFat2Mask256( runtime::TeddyFat2Mask256 { mask1: runtime::Mask256::new(masks[0]), mask2: runtime::Mask256::new(masks[1]), }, ), }), (3, false, _) => Some(Teddy { buckets: buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim3Mask128( runtime::TeddySlim3Mask128 { mask1: runtime::Mask128::new(masks[0]), mask2: runtime::Mask128::new(masks[1]), mask3: runtime::Mask128::new(masks[2]), }, ), }), (3, true, false) => Some(Teddy { buckets: buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddySlim3Mask256( runtime::TeddySlim3Mask256 { mask1: runtime::Mask256::new(masks[0]), mask2: runtime::Mask256::new(masks[1]), mask3: runtime::Mask256::new(masks[2]), }, ), }), (3, true, true) => Some(Teddy { buckets: buckets, max_pattern_id: patterns.max_pattern_id(), exec: runtime::Exec::TeddyFat3Mask256( runtime::TeddyFat3Mask256 { mask1: runtime::Mask256::new(masks[0]), mask2: runtime::Mask256::new(masks[1]), mask3: runtime::Mask256::new(masks[2]), }, ), }), _ => unreachable!(), } } } /// A compiler is in charge of allocating patterns into buckets and generating /// the masks necessary for searching. #[derive(Clone)] struct Compiler<'p> { patterns: &'p Patterns, buckets: Vec>, masks: Vec, } impl<'p> Compiler<'p> { /// Create a new Teddy compiler for the given patterns. If `fat` is true, /// then 16 buckets will be used instead of 8. /// /// This panics if any of the patterns given are empty. fn new(patterns: &'p Patterns, fat: bool) -> Compiler<'p> { let mask_len = cmp::min(3, patterns.minimum_len()); assert!(1 <= mask_len && mask_len <= 3); Compiler { patterns, buckets: vec![vec![]; if fat { 16 } else { 8 }], masks: vec![Mask::default(); mask_len], } } /// Compile the patterns in this compiler into buckets and masks. fn compile(&mut self) { let mut lonibble_to_bucket: BTreeMap, usize> = BTreeMap::new(); for (id, pattern) in self.patterns.iter() { // We try to be slightly clever in how we assign patterns into // buckets. Generally speaking, we want patterns with the same // prefix to be in the same bucket, since it minimizes the amount // of time we spend churning through buckets in the verification // step. 
// // So we could assign patterns with the same N-prefix (where N // is the size of the mask, which is one of {1, 2, 3}) to the // same bucket. However, case insensitive searches are fairly // common, so we'd for example, ideally want to treat `abc` and // `ABC` as if they shared the same prefix. ASCII has the nice // property that the lower 4 bits of A and a are the same, so we // therefore group patterns with the same low-nybbe-N-prefix into // the same bucket. // // MOREOVER, this is actually necessary for correctness! In // particular, by grouping patterns with the same prefix into the // same bucket, we ensure that we preserve correct leftmost-first // and leftmost-longest match semantics. In addition to the fact // that `patterns.iter()` iterates in the correct order, this // guarantees that all possible ambiguous matches will occur in // the same bucket. The verification routine could be adjusted to // support correct leftmost match semantics regardless of bucket // allocation, but that results in a performance hit. It's much // nicer to be able to just stop as soon as a match is found. let lonybs = pattern.low_nybbles(self.masks.len()); if let Some(&bucket) = lonibble_to_bucket.get(&lonybs) { self.buckets[bucket].push(id); } else { // N.B. We assign buckets in reverse because it shouldn't have // any influence on performance, but it does make it harder to // get leftmost match semantics accidentally correct. let bucket = (self.buckets.len() - 1) - (id as usize % self.buckets.len()); self.buckets[bucket].push(id); lonibble_to_bucket.insert(lonybs, bucket); } } for (bucket_index, bucket) in self.buckets.iter().enumerate() { for &pat_id in bucket { let pat = self.patterns.get(pat_id); for (i, mask) in self.masks.iter_mut().enumerate() { if self.buckets.len() == 8 { mask.add_slim(bucket_index as u8, pat.bytes()[i]); } else { mask.add_fat(bucket_index as u8, pat.bytes()[i]); } } } } } } impl<'p> fmt::Debug for Compiler<'p> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut buckets = vec![vec![]; self.buckets.len()]; for (i, bucket) in self.buckets.iter().enumerate() { for &patid in bucket { buckets[i].push(self.patterns.get(patid)); } } f.debug_struct("Compiler") .field("buckets", &buckets) .field("masks", &self.masks) .finish() } } /// Mask represents the low and high nybble masks that will be used during /// search. Each mask is 32 bytes wide, although only the first 16 bytes are /// used for the SSSE3 runtime. /// /// Each byte in the mask corresponds to a 8-bit bitset, where bit `i` is set /// if and only if the corresponding nybble is in the ith bucket. The index of /// the byte (0-15, inclusive) corresponds to the nybble. /// /// Each mask is used as the target of a shuffle, where the indices for the /// shuffle are taken from the haystack. AND'ing the shuffles for both the /// low and high masks together also results in 8-bit bitsets, but where bit /// `i` is set if and only if the correspond *byte* is in the ith bucket. /// /// During compilation, masks are just arrays. But during search, these masks /// are represented as 128-bit or 256-bit vectors. /// /// (See the README is this directory for more details.) #[derive(Clone, Copy, Default)] pub struct Mask { lo: [u8; 32], hi: [u8; 32], } impl Mask { /// Update this mask by adding the given byte to the given bucket. The /// given bucket must be in the range 0-7. /// /// This is for "slim" Teddy, where there are only 8 buckets. 
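    ///
    /// For example, adding the byte `b'f'` (`0x66`) to bucket 3 sets bit 3
    /// in `lo[0x6]` and `hi[0x6]`, as well as in the copies at offset 16
    /// that are used by the 256-bit variants.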
fn add_slim(&mut self, bucket: u8, byte: u8) { assert!(bucket < 8); let byte_lo = (byte & 0xF) as usize; let byte_hi = ((byte >> 4) & 0xF) as usize; // When using 256-bit vectors, we need to set this bucket assignment in // the low and high 128-bit portions of the mask. This allows us to // process 32 bytes at a time. Namely, AVX2 shuffles operate on each // of the 128-bit lanes, rather than the full 256-bit vector at once. self.lo[byte_lo] |= 1 << bucket; self.lo[byte_lo + 16] |= 1 << bucket; self.hi[byte_hi] |= 1 << bucket; self.hi[byte_hi + 16] |= 1 << bucket; } /// Update this mask by adding the given byte to the given bucket. The /// given bucket must be in the range 0-15. /// /// This is for "fat" Teddy, where there are 16 buckets. fn add_fat(&mut self, bucket: u8, byte: u8) { assert!(bucket < 16); let byte_lo = (byte & 0xF) as usize; let byte_hi = ((byte >> 4) & 0xF) as usize; // Unlike slim teddy, fat teddy only works with AVX2. For fat teddy, // the high 128 bits of our mask correspond to buckets 8-15, while the // low 128 bits correspond to buckets 0-7. if bucket < 8 { self.lo[byte_lo] |= 1 << bucket; self.hi[byte_hi] |= 1 << bucket; } else { self.lo[byte_lo + 16] |= 1 << (bucket % 8); self.hi[byte_hi + 16] |= 1 << (bucket % 8); } } /// Return the low 128 bits of the low-nybble mask. pub fn lo128(&self) -> [u8; 16] { let mut tmp = [0; 16]; tmp.copy_from_slice(&self.lo[..16]); tmp } /// Return the full low-nybble mask. pub fn lo256(&self) -> [u8; 32] { self.lo } /// Return the low 128 bits of the high-nybble mask. pub fn hi128(&self) -> [u8; 16] { let mut tmp = [0; 16]; tmp.copy_from_slice(&self.hi[..16]); tmp } /// Return the full high-nybble mask. pub fn hi256(&self) -> [u8; 32] { self.hi } } impl fmt::Debug for Mask { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let (mut parts_lo, mut parts_hi) = (vec![], vec![]); for i in 0..32 { parts_lo.push(format!("{:02}: {:08b}", i, self.lo[i])); parts_hi.push(format!("{:02}: {:08b}", i, self.hi[i])); } f.debug_struct("Mask") .field("lo", &parts_lo) .field("hi", &parts_hi) .finish() } } aho-corasick-0.7.8/src/packed/teddy/mod.rs010064400017500000144000000024371352132431400165660ustar0000000000000000#[cfg(target_arch = "x86_64")] pub use packed::teddy::compile::Builder; #[cfg(not(target_arch = "x86_64"))] pub use packed::teddy::fallback::Builder; #[cfg(not(target_arch = "x86_64"))] pub use packed::teddy::fallback::Teddy; #[cfg(target_arch = "x86_64")] pub use packed::teddy::runtime::Teddy; #[cfg(target_arch = "x86_64")] mod compile; #[cfg(target_arch = "x86_64")] mod runtime; #[cfg(not(target_arch = "x86_64"))] mod fallback { use packed::pattern::Patterns; use Match; #[derive(Clone, Debug, Default)] pub struct Builder(()); impl Builder { pub fn new() -> Builder { Builder(()) } pub fn build(&self, _: &Patterns) -> Option { None } pub fn fat(&mut self, _: Option) -> &mut Builder { self } pub fn avx(&mut self, _: Option) -> &mut Builder { self } } #[derive(Clone, Debug)] pub struct Teddy(()); impl Teddy { pub fn find_at( &self, _: &Patterns, _: &[u8], _: usize, ) -> Option { None } pub fn minimum_len(&self) -> usize { 0 } pub fn heap_bytes(&self) -> usize { 0 } } } aho-corasick-0.7.8/src/packed/teddy/runtime.rs010064400017500000144000001255041352132431400174730ustar0000000000000000// See the README in this directory for an explanation of the Teddy algorithm. 
// It is strongly recommended to peruse the README before trying to grok this // code, as its use of SIMD is pretty opaque, although I tried to add comments // where appropriate. // // Moreover, while there is a lot of code in this file, most of it is // repeated variants of the same thing. Specifically, there are three Teddy // variants: Slim 128-bit Teddy (8 buckets), Slim 256-bit Teddy (8 buckets) // and Fat 256-bit Teddy (16 buckets). For each variant, there are three // implementations, corresponding to mask lengths of 1, 2 and 3. Bringing it to // a total of nine variants. Each one is structured roughly the same: // // while at <= len(haystack) - CHUNK_SIZE: // let candidate = find_candidate_in_chunk(haystack, at) // if not all zeroes(candidate): // if match = verify(haystack, at, candidate): // return match // // For the most part, this remains unchanged. The parts that vary are the // verification routine (for slim vs fat Teddy) and the candidate extraction // (based on the number of masks). // // In the code below, a "candidate" corresponds to a single vector with 8-bit // lanes. Each lane is itself an 8-bit bitset, where the ith bit is set in the // jth lane if and only if the byte occurring at position `j` is in the // bucket `i` (where the `j`th position is the position in the current window // of the haystack, which is always 16 or 32 bytes). Note to be careful here: // the ith bit and the jth lane correspond to the least significant bits of the // vector. So when visualizing how the current window of bytes is stored in a // vector, you often need to flip it around. For example, the text `abcd` in a // 4-byte vector would look like this: // // 01100100 01100011 01100010 01100001 // d c b a // // When the mask length is 1, then finding the candidate is pretty straight // forward: you just apply the shuffle indices (from the haystack window) to // the masks, and then AND them together, as described in the README. But for // masks of length 2 and 3, you need to keep a little state. Specifically, // you need to store the final 1 (for mask length 2) or 2 (for mask length 3) // bytes of the candidate for use when searching the next window. This is for // handling matches that span two windows. // // With respect to the repeated code, it would likely be possible to reduce // the number of copies of code below using polymorphism, but I find this // formulation clearer instead of needing to reason through generics. However, // I admit, there may be a simpler generic construction that I'm missing. // // All variants are fairly heavily tested in src/packed/tests.rs. use std::arch::x86_64::*; use std::mem; use packed::pattern::{PatternID, Patterns}; use packed::teddy::compile; use packed::vector::*; use Match; /// The Teddy runtime. /// /// A Teddy runtime can be used to quickly search for occurrences of one or /// more patterns. While it does not scale to an arbitrary number of patterns /// like Aho-Corasick, it does find occurrences for a small set of patterns /// much more quickly than Aho-Corasick. /// /// Teddy cannot run on small haystacks below a certain size, which is /// dependent on the type of matcher used. This size can be queried via the /// `minimum_len` method. Violating this will result in a panic. /// /// Finally, when callers use a Teddy runtime, they must provide precisely the /// patterns used to construct the Teddy matcher. Violating this will result /// in either a panic or incorrect results, but will never sacrifice memory /// safety. 
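///
/// A rough sketch of how this type is used internally (here `patterns` and
/// `haystack` are stand-ins, and `patterns` must be the same collection that
/// was given to the builder):
///
/// ```ignore
/// let teddy = compile::Builder::new().build(&patterns)?;
/// assert!(haystack.len() >= teddy.minimum_len());
/// if let Some(m) = teddy.find_at(&patterns, haystack, 0) {
///     // `m.pattern()` identifies which pattern matched.
/// }
/// ```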
#[derive(Clone, Debug)] pub struct Teddy { /// The allocation of patterns in buckets. This only contains the IDs of /// patterns. In order to do full verification, callers must provide the /// actual patterns when using Teddy. pub buckets: Vec>, /// The maximum identifier of a pattern. This is used as a sanity check to /// ensure that the patterns provided by the caller are the same as the /// patterns that were used to compile the matcher. This sanity check /// permits safely eliminating bounds checks regardless of what patterns /// are provided by the caller. /// /// Note that users of the aho-corasick crate cannot get this wrong. Only /// code internal to this crate can get it wrong, since neither `Patterns` /// type nor the Teddy runtime are public API items. pub max_pattern_id: PatternID, /// The actual runtime to use. pub exec: Exec, } impl Teddy { /// Return the first occurrence of a match in the given haystack after or /// starting at `at`. /// /// The patterns provided must be precisely the same patterns given to the /// Teddy builder, otherwise this may panic or produce incorrect results. /// /// All matches are consistent with the match semantics (leftmost-first or /// leftmost-longest) set on `pats`. pub fn find_at( &self, pats: &Patterns, haystack: &[u8], at: usize, ) -> Option { // This assert is a bit subtle, but it's an important guarantee. // Namely, if the maximum pattern ID seen by Teddy is the same as the // one in the patterns given, then we are guaranteed that every pattern // ID in all Teddy buckets are valid indices into `pats`. While this // is nominally true, there is no guarantee that callers provide the // same `pats` to both the Teddy builder and the searcher, which would // otherwise make `find_at` unsafe to call. But this assert lets us // keep this routine safe and eliminate an important bounds check in // verification. assert_eq!( self.max_pattern_id, pats.max_pattern_id(), "teddy must be called with same patterns it was built with", ); // SAFETY: The haystack must have at least a minimum number of bytes // for Teddy to be able to work. The minimum number varies depending on // which matcher is used below. If this is violated, then it's possible // for searching to do out-of-bounds writes. assert!(haystack[at..].len() >= self.minimum_len()); // SAFETY: The various Teddy matchers are always safe to call because // the Teddy builder guarantees that a particular Exec variant is // built only when it can be run the current CPU. That is, the Teddy // builder will not produce a Exec::TeddySlim1Mask256 unless AVX2 is // enabled. That is, our dynamic CPU feature detection is performed // once in the builder, and we rely on the type system to avoid needing // to do it again. unsafe { match self.exec { Exec::TeddySlim1Mask128(ref e) => { e.find_at(pats, self, haystack, at) } Exec::TeddySlim1Mask256(ref e) => { e.find_at(pats, self, haystack, at) } Exec::TeddyFat1Mask256(ref e) => { e.find_at(pats, self, haystack, at) } Exec::TeddySlim2Mask128(ref e) => { e.find_at(pats, self, haystack, at) } Exec::TeddySlim2Mask256(ref e) => { e.find_at(pats, self, haystack, at) } Exec::TeddyFat2Mask256(ref e) => { e.find_at(pats, self, haystack, at) } Exec::TeddySlim3Mask128(ref e) => { e.find_at(pats, self, haystack, at) } Exec::TeddySlim3Mask256(ref e) => { e.find_at(pats, self, haystack, at) } Exec::TeddyFat3Mask256(ref e) => { e.find_at(pats, self, haystack, at) } } } } /// Returns the minimum length of a haystack that must be provided by /// callers to this Teddy searcher. 
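    /// (For the matchers below, this minimum ranges from 16 to 34 bytes,
    /// depending on the particular variant in use.)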
Providing a haystack shorter than this /// will result in a panic, but will never violate memory safety. pub fn minimum_len(&self) -> usize { // SAFETY: These values must be correct in order to ensure safety. // The Teddy runtime assumes their haystacks have at least these // lengths. Violating this will sacrifice memory safety. match self.exec { Exec::TeddySlim1Mask128(_) => 16, Exec::TeddySlim1Mask256(_) => 32, Exec::TeddyFat1Mask256(_) => 16, Exec::TeddySlim2Mask128(_) => 17, Exec::TeddySlim2Mask256(_) => 33, Exec::TeddyFat2Mask256(_) => 17, Exec::TeddySlim3Mask128(_) => 18, Exec::TeddySlim3Mask256(_) => 34, Exec::TeddyFat3Mask256(_) => 34, } } /// Returns the approximate total amount of heap used by this searcher, in /// units of bytes. pub fn heap_bytes(&self) -> usize { let num_patterns = self.max_pattern_id as usize + 1; self.buckets.len() * mem::size_of::>() + num_patterns * mem::size_of::() } /// Runs the verification routine for Slim 128-bit Teddy. /// /// The candidate given should be a collection of 8-bit bitsets (one bitset /// per lane), where the ith bit is set in the jth lane if and only if the /// byte occurring at `at + j` in `haystack` is in the bucket `i`. /// /// This is not safe to call unless the SSSE3 target feature is enabled. /// The `target_feature` attribute is not applied since this function is /// always forcefully inlined. #[inline(always)] unsafe fn verify128( &self, pats: &Patterns, haystack: &[u8], at: usize, cand: __m128i, ) -> Option { debug_assert!(!is_all_zeroes128(cand)); debug_assert_eq!(8, self.buckets.len()); // Convert the candidate into 64-bit chunks, and then verify each of // those chunks. let parts = unpack64x128(cand); for (i, &part) in parts.iter().enumerate() { let pos = at + i * 8; if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { return Some(m); } } None } /// Runs the verification routine for Slim 256-bit Teddy. /// /// The candidate given should be a collection of 8-bit bitsets (one bitset /// per lane), where the ith bit is set in the jth lane if and only if the /// byte occurring at `at + j` in `haystack` is in the bucket `i`. /// /// This is not safe to call unless the AVX2 target feature is enabled. /// The `target_feature` attribute is not applied since this function is /// always forcefully inlined. #[inline(always)] unsafe fn verify256( &self, pats: &Patterns, haystack: &[u8], at: usize, cand: __m256i, ) -> Option { debug_assert!(!is_all_zeroes256(cand)); debug_assert_eq!(8, self.buckets.len()); // Convert the candidate into 64-bit chunks, and then verify each of // those chunks. let parts = unpack64x256(cand); for (i, &part) in parts.iter().enumerate() { let pos = at + i * 8; if let Some(m) = self.verify64(pats, 8, haystack, pos, part) { return Some(m); } } None } /// Runs the verification routine for Fat 256-bit Teddy. /// /// The candidate given should be a collection of 8-bit bitsets (one bitset /// per lane), where the ith bit is set in the jth lane if and only if the /// byte occurring at `at + (j < 16 ? j : j - 16)` in `haystack` is in the /// bucket `j < 16 ? i : i + 8`. /// /// This is not safe to call unless the AVX2 target feature is enabled. /// The `target_feature` attribute is not applied since this function is /// always forcefully inlined. 
#[inline(always)] unsafe fn verify_fat256( &self, pats: &Patterns, haystack: &[u8], at: usize, cand: __m256i, ) -> Option { debug_assert!(!is_all_zeroes256(cand)); debug_assert_eq!(16, self.buckets.len()); // This is a bit tricky, but we basically want to convert our // candidate, which looks like this // // a31 a30 ... a17 a16 a15 a14 ... a01 a00 // // where each a(i) is an 8-bit bitset corresponding to the activated // buckets, to this // // a31 a15 a30 a14 a29 a13 ... a18 a02 a17 a01 a16 a00 // // Namely, for Fat Teddy, the high 128-bits of the candidate correspond // to the same bytes in the haystack in the low 128-bits (so we only // scan 16 bytes at a time), but are for buckets 8-15 instead of 0-7. // // The verification routine wants to look at all potentially matching // buckets before moving on to the next lane. So for example, both // a16 and a00 both correspond to the first byte in our window; a00 // contains buckets 0-7 and a16 contains buckets 8-15. Specifically, // a16 should be checked before a01. So the transformation shown above // allows us to use our normal verification procedure with one small // change: we treat each bitset as 16 bits instead of 8 bits. // Swap the 128-bit lanes in the candidate vector. let swap = _mm256_permute4x64_epi64(cand, 0x4E); // Interleave the bytes from the low 128-bit lanes, starting with // cand first. let r1 = _mm256_unpacklo_epi8(cand, swap); // Interleave the bytes from the high 128-bit lanes, starting with // cand first. let r2 = _mm256_unpackhi_epi8(cand, swap); // Now just take the 2 low 64-bit integers from both r1 and r2. We // can drop the high 64-bit integers because they are a mirror image // of the low 64-bit integers. All we care about are the low 128-bit // lanes of r1 and r2. Combined, they contain all our 16-bit bitsets // laid out in the desired order, as described above. let parts = unpacklo64x256(r1, r2); for (i, &part) in parts.iter().enumerate() { let pos = at + i * 4; if let Some(m) = self.verify64(pats, 16, haystack, pos, part) { return Some(m); } } None } /// Verify whether there are any matches starting at or after `at` in the /// given `haystack`. The candidate given should correspond to either 8-bit /// (for 8 buckets) or 16-bit (16 buckets) bitsets. #[inline(always)] fn verify64( &self, pats: &Patterns, bucket_count: usize, haystack: &[u8], at: usize, mut cand: u64, ) -> Option { // N.B. While the bucket count is known from self.buckets.len(), // requiring it as a parameter makes it easier for the optimizer to // know its value, and thus produce more efficient codegen. debug_assert!(bucket_count == 8 || bucket_count == 16); while cand != 0 { let bit = cand.trailing_zeros() as usize; cand &= !(1 << bit); let at = at + (bit / bucket_count); let bucket = bit % bucket_count; if let Some(m) = self.verify_bucket(pats, haystack, bucket, at) { return Some(m); } } None } /// Verify whether there are any matches starting at `at` in the given /// `haystack` corresponding only to patterns in the given bucket. #[inline(always)] fn verify_bucket( &self, pats: &Patterns, haystack: &[u8], bucket: usize, at: usize, ) -> Option { // Forcing this function to not inline and be "cold" seems to help // the codegen for Teddy overall. Interestingly, this is good for a // 16% boost in the sherlock/packed/teddy/name/alt1 benchmark (among // others). Overall, this seems like a problem with codegen, since // creating the Match itself is a very small amount of code. 
#[cold] #[inline(never)] fn match_from_span( pati: PatternID, start: usize, end: usize, ) -> Match { Match::from_span(pati as usize, start, end) } // N.B. The bounds check for this bucket lookup *should* be elided // since we assert the number of buckets in each `find_at` routine, // and the compiler can prove that the `% 8` (or `% 16`) in callers // of this routine will always be in bounds. for &pati in &self.buckets[bucket] { // SAFETY: This is safe because we are guaranteed that every // index in a Teddy bucket is a valid index into `pats`. This // guarantee is upheld by the assert checking `max_pattern_id` in // the beginning of `find_at` above. // // This explicit bounds check elision is (amazingly) good for a // 25-50% boost in some benchmarks, particularly ones with a lot // of short literals. let pat = unsafe { pats.get_unchecked(pati) }; if pat.is_prefix(&haystack[at..]) { return Some(match_from_span(pati, at, at + pat.len())); } } None } } /// Exec represents the different search strategies supported by the Teddy /// runtime. /// /// This enum is an important safety abstraction. Namely, callers should only /// construct a variant in this enum if it is safe to execute its corresponding /// target features on the current CPU. The 128-bit searchers require SSSE3, /// while the 256-bit searchers require AVX2. #[derive(Clone, Debug)] pub enum Exec { TeddySlim1Mask128(TeddySlim1Mask128), TeddySlim1Mask256(TeddySlim1Mask256), TeddyFat1Mask256(TeddyFat1Mask256), TeddySlim2Mask128(TeddySlim2Mask128), TeddySlim2Mask256(TeddySlim2Mask256), TeddyFat2Mask256(TeddyFat2Mask256), TeddySlim3Mask128(TeddySlim3Mask128), TeddySlim3Mask256(TeddySlim3Mask256), TeddyFat3Mask256(TeddyFat3Mask256), } // Most of the code below remains undocumented because they are effectively // repeated versions of themselves. The general structure is described in the // README and in the comments above. #[derive(Clone, Debug)] pub struct TeddySlim1Mask128 { pub mask1: Mask128, } impl TeddySlim1Mask128 { #[target_feature(enable = "ssse3")] unsafe fn find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(haystack[at..].len() >= teddy.minimum_len()); // This assert helps eliminate bounds checks for bucket lookups in // Teddy::verify_bucket, which has a small (3-4%) performance boost. assert_eq!(8, teddy.buckets.len()); let len = haystack.len(); while at <= len - 16 { let c = self.candidate(haystack, at); if !is_all_zeroes128(c) { if let Some(m) = teddy.verify128(pats, haystack, at, c) { return Some(m); } } at += 16; } if at < len { at = len - 16; let c = self.candidate(haystack, at); if !is_all_zeroes128(c) { if let Some(m) = teddy.verify128(pats, haystack, at, c) { return Some(m); } } } None } #[inline(always)] unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m128i { debug_assert!(haystack[at..].len() >= 16); let chunk = loadu128(haystack, at); members1m128(chunk, self.mask1) } } #[derive(Clone, Debug)] pub struct TeddySlim1Mask256 { pub mask1: Mask256, } impl TeddySlim1Mask256 { #[target_feature(enable = "avx2")] unsafe fn find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(haystack[at..].len() >= teddy.minimum_len()); // This assert helps eliminate bounds checks for bucket lookups in // Teddy::verify_bucket, which has a small (3-4%) performance boost. 
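// The scan below proceeds in 16-byte windows. Any final partial window is
// handled by re-scanning the last 16 bytes of the haystack, which may
// overlap bytes that were already scanned; the `at = len - 16` tail step is
// also why callers must provide at least `minimum_len()` bytes. As an
// illustrative example, searching a 20-byte haystack from `at = 0` inspects
// the windows `[0, 16)` and then `[4, 20)`.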
assert_eq!(8, teddy.buckets.len()); let len = haystack.len(); while at <= len - 32 { let c = self.candidate(haystack, at); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify256(pats, haystack, at, c) { return Some(m); } } at += 32; } if at < len { at = len - 32; let c = self.candidate(haystack, at); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify256(pats, haystack, at, c) { return Some(m); } } } None } #[inline(always)] unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { debug_assert!(haystack[at..].len() >= 32); let chunk = loadu256(haystack, at); members1m256(chunk, self.mask1) } } #[derive(Clone, Debug)] pub struct TeddyFat1Mask256 { pub mask1: Mask256, } impl TeddyFat1Mask256 { #[target_feature(enable = "avx2")] unsafe fn find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(haystack[at..].len() >= teddy.minimum_len()); // This assert helps eliminate bounds checks for bucket lookups in // Teddy::verify_bucket, which has a small (3-4%) performance boost. assert_eq!(16, teddy.buckets.len()); let len = haystack.len(); while at <= len - 16 { let c = self.candidate(haystack, at); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { return Some(m); } } at += 16; } if at < len { at = len - 16; let c = self.candidate(haystack, at); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify_fat256(pats, haystack, at, c) { return Some(m); } } } None } #[inline(always)] unsafe fn candidate(&self, haystack: &[u8], at: usize) -> __m256i { debug_assert!(haystack[at..].len() >= 16); let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); members1m256(chunk, self.mask1) } } #[derive(Clone, Debug)] pub struct TeddySlim2Mask128 { pub mask1: Mask128, pub mask2: Mask128, } impl TeddySlim2Mask128 { #[target_feature(enable = "ssse3")] unsafe fn find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(haystack[at..].len() >= teddy.minimum_len()); // This assert helps eliminate bounds checks for bucket lookups in // Teddy::verify_bucket, which has a small (3-4%) performance boost. assert_eq!(8, teddy.buckets.len()); at += 1; let len = haystack.len(); let mut prev0 = ones128(); while at <= len - 16 { let c = self.candidate(haystack, at, &mut prev0); if !is_all_zeroes128(c) { if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { return Some(m); } } at += 16; } if at < len { at = len - 16; prev0 = ones128(); let c = self.candidate(haystack, at, &mut prev0); if !is_all_zeroes128(c) { if let Some(m) = teddy.verify128(pats, haystack, at - 1, c) { return Some(m); } } } None } #[inline(always)] unsafe fn candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m128i, ) -> __m128i { debug_assert!(haystack[at..].len() >= 16); let chunk = loadu128(haystack, at); let (res0, res1) = members2m128(chunk, self.mask1, self.mask2); let res0prev0 = _mm_alignr_epi8(res0, *prev0, 15); _mm_and_si128(res0prev0, res1) } } #[derive(Clone, Debug)] pub struct TeddySlim2Mask256 { pub mask1: Mask256, pub mask2: Mask256, } impl TeddySlim2Mask256 { #[target_feature(enable = "avx2")] unsafe fn find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(haystack[at..].len() >= teddy.minimum_len()); // This assert helps eliminate bounds checks for bucket lookups in // Teddy::verify_bucket, which has a small (3-4%) performance boost. 
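// For the two-byte masks, the candidate computed for a window starting at
// `at` ANDs the mask-2 result for each byte with the mask-1 result for the
// byte one position earlier (carried across windows in `prev0` and shifted
// into place with an alignr). A set bit at window offset `j` therefore
// points at a potential match starting at `at + j - 1`, which is why the
// search starts one byte in (`at += 1` below) and verification runs at
// `at - 1`.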
assert_eq!(8, teddy.buckets.len()); at += 1; let len = haystack.len(); let mut prev0 = ones256(); while at <= len - 32 { let c = self.candidate(haystack, at, &mut prev0); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { return Some(m); } } at += 32; } if at < len { at = len - 32; prev0 = ones256(); let c = self.candidate(haystack, at, &mut prev0); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify256(pats, haystack, at - 1, c) { return Some(m); } } } None } #[inline(always)] unsafe fn candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m256i, ) -> __m256i { debug_assert!(haystack[at..].len() >= 32); let chunk = loadu256(haystack, at); let (res0, res1) = members2m256(chunk, self.mask1, self.mask2); let res0prev0 = alignr256_15(res0, *prev0); let res = _mm256_and_si256(res0prev0, res1); *prev0 = res0; res } } #[derive(Clone, Debug)] pub struct TeddyFat2Mask256 { pub mask1: Mask256, pub mask2: Mask256, } impl TeddyFat2Mask256 { #[target_feature(enable = "avx2")] unsafe fn find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(haystack[at..].len() >= teddy.minimum_len()); // This assert helps eliminate bounds checks for bucket lookups in // Teddy::verify_bucket, which has a small (3-4%) performance boost. assert_eq!(16, teddy.buckets.len()); at += 1; let len = haystack.len(); let mut prev0 = ones256(); while at <= len - 16 { let c = self.candidate(haystack, at, &mut prev0); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c) { return Some(m); } } at += 16; } if at < len { at = len - 16; prev0 = ones256(); let c = self.candidate(haystack, at, &mut prev0); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify_fat256(pats, haystack, at - 1, c) { return Some(m); } } } None } #[inline(always)] unsafe fn candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m256i, ) -> __m256i { debug_assert!(haystack[at..].len() >= 16); let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); let (res0, res1) = members2m256(chunk, self.mask1, self.mask2); let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 15); let res = _mm256_and_si256(res0prev0, res1); *prev0 = res0; res } } #[derive(Clone, Debug)] pub struct TeddySlim3Mask128 { pub mask1: Mask128, pub mask2: Mask128, pub mask3: Mask128, } impl TeddySlim3Mask128 { #[target_feature(enable = "ssse3")] unsafe fn find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(haystack[at..].len() >= teddy.minimum_len()); // This assert helps eliminate bounds checks for bucket lookups in // Teddy::verify_bucket, which has a small (3-4%) performance boost. 
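// Fat Teddy: `candidate` below broadcasts a 16-byte window into both
// 128-bit halves of the vector, so the low half tests buckets 0-7 and the
// high half tests buckets 8-15 against the same bytes. That is why this
// variant asserts 16 buckets, advances 16 bytes at a time and verifies
// candidates with `verify_fat256`.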
assert_eq!(8, teddy.buckets.len()); at += 2; let len = haystack.len(); let (mut prev0, mut prev1) = (ones128(), ones128()); while at <= len - 16 { let c = self.candidate(haystack, at, &mut prev0, &mut prev1); if !is_all_zeroes128(c) { if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { return Some(m); } } at += 16; } if at < len { at = len - 16; prev0 = ones128(); prev1 = ones128(); let c = self.candidate(haystack, at, &mut prev0, &mut prev1); if !is_all_zeroes128(c) { if let Some(m) = teddy.verify128(pats, haystack, at - 2, c) { return Some(m); } } } None } #[inline(always)] unsafe fn candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m128i, prev1: &mut __m128i, ) -> __m128i { debug_assert!(haystack[at..].len() >= 16); let chunk = loadu128(haystack, at); let (res0, res1, res2) = members3m128(chunk, self.mask1, self.mask2, self.mask3); let res0prev0 = _mm_alignr_epi8(res0, *prev0, 14); let res1prev1 = _mm_alignr_epi8(res1, *prev1, 15); let res = _mm_and_si128(_mm_and_si128(res0prev0, res1prev1), res2); *prev0 = res0; *prev1 = res1; res } } #[derive(Clone, Debug)] pub struct TeddySlim3Mask256 { pub mask1: Mask256, pub mask2: Mask256, pub mask3: Mask256, } impl TeddySlim3Mask256 { #[target_feature(enable = "avx2")] unsafe fn find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(haystack[at..].len() >= teddy.minimum_len()); // This assert helps eliminate bounds checks for bucket lookups in // Teddy::verify_bucket, which has a small (3-4%) performance boost. assert_eq!(8, teddy.buckets.len()); at += 2; let len = haystack.len(); let (mut prev0, mut prev1) = (ones256(), ones256()); while at <= len - 32 { let c = self.candidate(haystack, at, &mut prev0, &mut prev1); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) { return Some(m); } } at += 32; } if at < len { at = len - 32; prev0 = ones256(); prev1 = ones256(); let c = self.candidate(haystack, at, &mut prev0, &mut prev1); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify256(pats, haystack, at - 2, c) { return Some(m); } } } None } #[inline(always)] unsafe fn candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m256i, prev1: &mut __m256i, ) -> __m256i { debug_assert!(haystack[at..].len() >= 32); let chunk = loadu256(haystack, at); let (res0, res1, res2) = members3m256(chunk, self.mask1, self.mask2, self.mask3); let res0prev0 = alignr256_14(res0, *prev0); let res1prev1 = alignr256_15(res1, *prev1); let res = _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2); *prev0 = res0; *prev1 = res1; res } } #[derive(Clone, Debug)] pub struct TeddyFat3Mask256 { pub mask1: Mask256, pub mask2: Mask256, pub mask3: Mask256, } impl TeddyFat3Mask256 { #[target_feature(enable = "avx2")] unsafe fn find_at( &self, pats: &Patterns, teddy: &Teddy, haystack: &[u8], mut at: usize, ) -> Option { debug_assert!(haystack[at..].len() >= teddy.minimum_len()); // This assert helps eliminate bounds checks for bucket lookups in // Teddy::verify_bucket, which has a small (3-4%) performance boost. 
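// The three-byte masks extend the same idea: the mask-3 result for each
// byte is ANDed with the mask-2 result for the previous byte and the
// mask-1 result for the byte before that (carried in `prev1` and `prev0`
// and aligned via the 15- and 14-byte alignr shifts in `candidate`). A set
// bit at window offset `j` thus points at a potential match starting at
// `at + j - 2`, hence `at += 2` below and verification at `at - 2`.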
assert_eq!(16, teddy.buckets.len()); at += 2; let len = haystack.len(); let (mut prev0, mut prev1) = (ones256(), ones256()); while at <= len - 16 { let c = self.candidate(haystack, at, &mut prev0, &mut prev1); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c) { return Some(m); } } at += 16; } if at < len { at = len - 16; prev0 = ones256(); prev1 = ones256(); let c = self.candidate(haystack, at, &mut prev0, &mut prev1); if !is_all_zeroes256(c) { if let Some(m) = teddy.verify_fat256(pats, haystack, at - 2, c) { return Some(m); } } } None } #[inline(always)] unsafe fn candidate( &self, haystack: &[u8], at: usize, prev0: &mut __m256i, prev1: &mut __m256i, ) -> __m256i { debug_assert!(haystack[at..].len() >= 16); let chunk = _mm256_broadcastsi128_si256(loadu128(haystack, at)); let (res0, res1, res2) = members3m256(chunk, self.mask1, self.mask2, self.mask3); let res0prev0 = _mm256_alignr_epi8(res0, *prev0, 14); let res1prev1 = _mm256_alignr_epi8(res1, *prev1, 15); let res = _mm256_and_si256(_mm256_and_si256(res0prev0, res1prev1), res2); *prev0 = res0; *prev1 = res1; res } } /// A 128-bit mask for the low and high nybbles in a set of patterns. Each /// lane `j` corresponds to a bitset where the `i`th bit is set if and only if /// the nybble `j` is in the bucket `i` at a particular position. #[derive(Clone, Copy, Debug)] pub struct Mask128 { lo: __m128i, hi: __m128i, } impl Mask128 { /// Create a new SIMD mask from the mask produced by the Teddy builder. pub fn new(mask: compile::Mask) -> Mask128 { // SAFETY: This is safe since [u8; 16] has the same representation // as __m128i. unsafe { Mask128 { lo: mem::transmute(mask.lo128()), hi: mem::transmute(mask.hi128()), } } } } /// A 256-bit mask for the low and high nybbles in a set of patterns. Each /// lane `j` corresponds to a bitset where the `i`th bit is set if and only if /// the nybble `j` is in the bucket `i` at a particular position. /// /// This is slightly tweaked dependending on whether Slim or Fat Teddy is being /// used. For Slim Teddy, the bitsets in the lower 128-bits are the same as /// the bitsets in the higher 128-bits, so that we can search 32 bytes at a /// time. (Remember, the nybbles in the haystack are used as indices into these /// masks, and 256-bit shuffles only operate on 128-bit lanes.) /// /// For Fat Teddy, the bitsets are not repeated, but instead, the high 128 /// bits correspond to buckets 8-15. So that a bitset `00100010` has buckets /// 1 and 5 set if it's in the lower 128 bits, but has buckets 9 and 13 set /// if it's in the higher 128 bits. #[derive(Clone, Copy, Debug)] pub struct Mask256 { lo: __m256i, hi: __m256i, } impl Mask256 { /// Create a new SIMD mask from the mask produced by the Teddy builder. pub fn new(mask: compile::Mask) -> Mask256 { // SAFETY: This is safe since [u8; 32] has the same representation // as __m256i. unsafe { Mask256 { lo: mem::transmute(mask.lo256()), hi: mem::transmute(mask.hi256()), } } } } // The "members" routines below are responsible for taking a chunk of bytes, // a number of nybble masks and returning the result of using the masks to // lookup bytes in the chunk. The results of the high and low nybble masks are // AND'ed together, such that each candidate returned is a vector, with byte // sized lanes, and where each lane is an 8-bit bitset corresponding to the // buckets that contain the corresponding byte. 
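//
// As an illustrative walk-through: for the haystack byte 0x41 ('A'), the
// low nybble 0x1 selects a lane of the `lo` mask and the high nybble 0x4
// selects a lane of the `hi` mask. Each lookup yields the bitset of buckets
// containing some pattern byte with that nybble at this position, and
// ANDing the two bitsets leaves only the buckets that could actually
// contain 0x41 itself.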
// // In the case of masks of length greater than 1, callers will need to keep // the results from the previous haystack's window, and then shift the vectors // so that they all line up. Then they can be AND'ed together. /// Return a candidate for Slim 128-bit Teddy, where `chunk` corresponds to a /// 16-byte window of the haystack (where the least significant byte /// corresponds to the start of the window), and `mask1` corresponds to a /// low/high mask for the first byte of all patterns that are being searched. #[target_feature(enable = "ssse3")] unsafe fn members1m128(chunk: __m128i, mask1: Mask128) -> __m128i { let lomask = _mm_set1_epi8(0xF); let hlo = _mm_and_si128(chunk, lomask); let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); _mm_and_si128( _mm_shuffle_epi8(mask1.lo, hlo), _mm_shuffle_epi8(mask1.hi, hhi), ) } /// Return a candidate for Slim 256-bit Teddy, where `chunk` corresponds to a /// 32-byte window of the haystack (where the least significant byte /// corresponds to the start of the window), and `mask1` corresponds to a /// low/high mask for the first byte of all patterns that are being searched. /// /// Note that this can also be used for Fat Teddy, where the high 128 bits in /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte /// window in the haystack. #[target_feature(enable = "avx2")] unsafe fn members1m256(chunk: __m256i, mask1: Mask256) -> __m256i { let lomask = _mm256_set1_epi8(0xF); let hlo = _mm256_and_si256(chunk, lomask); let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); _mm256_and_si256( _mm256_shuffle_epi8(mask1.lo, hlo), _mm256_shuffle_epi8(mask1.hi, hhi), ) } /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds /// to a 16-byte window of the haystack (where the least significant byte /// corresponds to the start of the window), and the masks correspond to a /// low/high mask for the first and second bytes of all patterns that are being /// searched. The vectors returned correspond to candidates for the first and /// second bytes in the patterns represented by the masks. #[target_feature(enable = "ssse3")] unsafe fn members2m128( chunk: __m128i, mask1: Mask128, mask2: Mask128, ) -> (__m128i, __m128i) { let lomask = _mm_set1_epi8(0xF); let hlo = _mm_and_si128(chunk, lomask); let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); let res0 = _mm_and_si128( _mm_shuffle_epi8(mask1.lo, hlo), _mm_shuffle_epi8(mask1.hi, hhi), ); let res1 = _mm_and_si128( _mm_shuffle_epi8(mask2.lo, hlo), _mm_shuffle_epi8(mask2.hi, hhi), ); (res0, res1) } /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds /// to a 32-byte window of the haystack (where the least significant byte /// corresponds to the start of the window), and the masks correspond to a /// low/high mask for the first and second bytes of all patterns that are being /// searched. The vectors returned correspond to candidates for the first and /// second bytes in the patterns represented by the masks. /// /// Note that this can also be used for Fat Teddy, where the high 128 bits in /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte /// window in the haystack. 
#[target_feature(enable = "avx2")] unsafe fn members2m256( chunk: __m256i, mask1: Mask256, mask2: Mask256, ) -> (__m256i, __m256i) { let lomask = _mm256_set1_epi8(0xF); let hlo = _mm256_and_si256(chunk, lomask); let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); let res0 = _mm256_and_si256( _mm256_shuffle_epi8(mask1.lo, hlo), _mm256_shuffle_epi8(mask1.hi, hhi), ); let res1 = _mm256_and_si256( _mm256_shuffle_epi8(mask2.lo, hlo), _mm256_shuffle_epi8(mask2.hi, hhi), ); (res0, res1) } /// Return candidates for Slim 128-bit Teddy, where `chunk` corresponds /// to a 16-byte window of the haystack (where the least significant byte /// corresponds to the start of the window), and the masks correspond to a /// low/high mask for the first, second and third bytes of all patterns that /// are being searched. The vectors returned correspond to candidates for the /// first, second and third bytes in the patterns represented by the masks. #[target_feature(enable = "ssse3")] unsafe fn members3m128( chunk: __m128i, mask1: Mask128, mask2: Mask128, mask3: Mask128, ) -> (__m128i, __m128i, __m128i) { let lomask = _mm_set1_epi8(0xF); let hlo = _mm_and_si128(chunk, lomask); let hhi = _mm_and_si128(_mm_srli_epi16(chunk, 4), lomask); let res0 = _mm_and_si128( _mm_shuffle_epi8(mask1.lo, hlo), _mm_shuffle_epi8(mask1.hi, hhi), ); let res1 = _mm_and_si128( _mm_shuffle_epi8(mask2.lo, hlo), _mm_shuffle_epi8(mask2.hi, hhi), ); let res2 = _mm_and_si128( _mm_shuffle_epi8(mask3.lo, hlo), _mm_shuffle_epi8(mask3.hi, hhi), ); (res0, res1, res2) } /// Return candidates for Slim 256-bit Teddy, where `chunk` corresponds /// to a 32-byte window of the haystack (where the least significant byte /// corresponds to the start of the window), and the masks correspond to a /// low/high mask for the first, second and third bytes of all patterns that /// are being searched. The vectors returned correspond to candidates for the /// first, second and third bytes in the patterns represented by the masks. /// /// Note that this can also be used for Fat Teddy, where the high 128 bits in /// `chunk` is the same as the low 128 bits, which corresponds to a 16 byte /// window in the haystack. #[target_feature(enable = "avx2")] unsafe fn members3m256( chunk: __m256i, mask1: Mask256, mask2: Mask256, mask3: Mask256, ) -> (__m256i, __m256i, __m256i) { let lomask = _mm256_set1_epi8(0xF); let hlo = _mm256_and_si256(chunk, lomask); let hhi = _mm256_and_si256(_mm256_srli_epi16(chunk, 4), lomask); let res0 = _mm256_and_si256( _mm256_shuffle_epi8(mask1.lo, hlo), _mm256_shuffle_epi8(mask1.hi, hhi), ); let res1 = _mm256_and_si256( _mm256_shuffle_epi8(mask2.lo, hlo), _mm256_shuffle_epi8(mask2.hi, hhi), ); let res2 = _mm256_and_si256( _mm256_shuffle_epi8(mask3.lo, hlo), _mm256_shuffle_epi8(mask3.hi, hhi), ); (res0, res1, res2) } aho-corasick-0.7.8/src/packed/tests.rs010064400017500000144000000433241352131022200160310ustar0000000000000000use std::collections::HashMap; use std::usize; use packed::{Config, MatchKind}; use Match; /// A description of a single test against a multi-pattern searcher. /// /// A single test may not necessarily pass on every configuration of a /// searcher. The tests are categorized and grouped appropriately below. #[derive(Clone, Debug, Eq, PartialEq)] struct SearchTest { /// The name of this test, for debugging. name: &'static str, /// The patterns to search for. patterns: &'static [&'static str], /// The text to search. 
haystack: &'static str, /// Each match is a triple of (pattern_index, start, end), where /// pattern_index is an index into `patterns` and `start`/`end` are indices /// into `haystack`. matches: &'static [(usize, usize, usize)], } struct SearchTestOwned { offset: usize, name: String, patterns: Vec, haystack: String, matches: Vec<(usize, usize, usize)>, } impl SearchTest { fn variations(&self) -> Vec { let mut tests = vec![]; for i in 0..=260 { tests.push(self.offset_prefix(i)); tests.push(self.offset_suffix(i)); tests.push(self.offset_both(i)); } tests } fn offset_both(&self, off: usize) -> SearchTestOwned { SearchTestOwned { offset: off, name: self.name.to_string(), patterns: self.patterns.iter().map(|s| s.to_string()).collect(), haystack: format!( "{}{}{}", "Z".repeat(off), self.haystack, "Z".repeat(off) ), matches: self .matches .iter() .map(|&(id, s, e)| (id, s + off, e + off)) .collect(), } } fn offset_prefix(&self, off: usize) -> SearchTestOwned { SearchTestOwned { offset: off, name: self.name.to_string(), patterns: self.patterns.iter().map(|s| s.to_string()).collect(), haystack: format!("{}{}", "Z".repeat(off), self.haystack), matches: self .matches .iter() .map(|&(id, s, e)| (id, s + off, e + off)) .collect(), } } fn offset_suffix(&self, off: usize) -> SearchTestOwned { SearchTestOwned { offset: off, name: self.name.to_string(), patterns: self.patterns.iter().map(|s| s.to_string()).collect(), haystack: format!("{}{}", self.haystack, "Z".repeat(off)), matches: self.matches.to_vec(), } } // fn to_owned(&self) -> SearchTestOwned { // SearchTestOwned { // name: self.name.to_string(), // patterns: self.patterns.iter().map(|s| s.to_string()).collect(), // haystack: self.haystack.to_string(), // matches: self.matches.iter().cloned().collect(), // } // } } /// Short-hand constructor for SearchTest. We use it a lot below. macro_rules! t { ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => { SearchTest { name: stringify!($name), patterns: $patterns, haystack: $haystack, matches: $matches, } }; } /// A collection of test groups. type TestCollection = &'static [&'static [SearchTest]]; // Define several collections corresponding to the different type of match // semantics supported. These collections have some overlap, but each // collection should have some tests that no other collection has. /// Tests for leftmost-first match semantics. const PACKED_LEFTMOST_FIRST: TestCollection = &[BASICS, LEFTMOST, LEFTMOST_FIRST, REGRESSION, TEDDY]; /// Tests for leftmost-longest match semantics. const PACKED_LEFTMOST_LONGEST: TestCollection = &[BASICS, LEFTMOST, LEFTMOST_LONGEST, REGRESSION, TEDDY]; // Now define the individual tests that make up the collections above. /// A collection of tests for the that should always be true regardless of /// match semantics. That is, all combinations of leftmost-{first, longest} /// should produce the same answer. 
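///
/// Each entry below reads as `t!(name, patterns, haystack, matches)`, with
/// every expected match given as `(pattern_index, start, end)`. For example,
/// `basic410` expects pattern 0 (`"foo"`) at `[0, 3)` and pattern 1
/// (`"bar"`) at `[3, 6)` in `"foobar"`.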
const BASICS: &'static [SearchTest] = &[ t!(basic001, &["a"], "", &[]), t!(basic010, &["a"], "a", &[(0, 0, 1)]), t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), t!(basic050, &["a"], "bba", &[(0, 2, 3)]), t!(basic060, &["a"], "bbb", &[]), t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), t!(basic100, &["aa"], "", &[]), t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), t!(basic130, &["aa"], "abbab", &[]), t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), t!(basic150, &["aaa"], "aaa", &[(0, 0, 3)]), t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), t!(basic300, &["a", "b"], "", &[]), t!(basic310, &["a", "b"], "z", &[]), t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), t!( basic340, &["a", "b"], "abba", &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] ), t!( basic350, &["b", "a"], "abba", &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] ), t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), t!(basic400, &["foo", "bar"], "", &[]), t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]), t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 7),]), t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), t!( basic720, &["yabcdef", "bcdeyabc", "abcdezghi"], "yabcdezghi", &[(2, 1, 10),] ), t!(basic810, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), t!(basic820, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), t!(basic830, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]), t!( basic840, &["ab", "ba"], "abababa", &[(0, 0, 2), (0, 2, 4), (0, 4, 6),] ), t!(basic850, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]), ]; /// Tests for leftmost match semantics. These should pass for both /// leftmost-first and leftmost-longest match kinds. Stated differently, among /// ambiguous matches, the longest match and the match that appeared first when /// constructing the automaton should always be the same. 
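///
/// (For a case where the two kinds disagree, and which therefore lives in
/// the more specific groups below, see `leftfirst000`/`leftlong000`: with
/// patterns `["ab", "abcd"]` and haystack `"abcd"`, leftmost-first reports
/// `(0, 0, 2)` while leftmost-longest reports `(1, 0, 4)`.)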
const LEFTMOST: &'static [SearchTest] = &[ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), t!( leftmost360, &["abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(2, 0, 8),] ), t!( leftmost370, &["abcdefghi", "cde", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost380, &["abcdefghi", "hz", "abcdefgh", "a"], "abcdefghz", &[(2, 0, 8),] ), t!( leftmost390, &["b", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost400, &["h", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost410, &["z", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8), (0, 8, 9),] ), ]; /// Tests for non-overlapping leftmost-first match semantics. These tests /// should generally be specific to leftmost-first, which means they should /// generally fail under leftmost-longest semantics. const LEFTMOST_FIRST: &'static [SearchTest] = &[ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), t!( leftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[(1, 1, 2), (3, 2, 4),] ), t!( leftfirst320, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(0, 0, 1), (2, 7, 9),] ), t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), t!( leftfirst340, &["abcdef", "x", "x", "x", "x", "x", "x", "abcde"], "abcdef", &[(0, 0, 6)] ), ]; /// Tests for non-overlapping leftmost-longest match semantics. These tests /// should generally be specific to leftmost-longest, which means they should /// generally fail under leftmost-first semantics. const LEFTMOST_LONGEST: &'static [SearchTest] = &[ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]), t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]), t!( leftlong310, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]), t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]), ]; /// Regression tests that are applied to all combinations. /// /// If regression tests are needed for specific match semantics, then add them /// to the appropriate group above. 
const REGRESSION: &'static [SearchTest] = &[ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), t!( regression030, &["libcore/", "libstd/"], "libcore/char/methods.rs", &[(0, 0, 8),] ), t!( regression040, &["libstd/", "libcore/"], "libcore/char/methods.rs", &[(1, 0, 8),] ), t!( regression050, &["\x00\x00\x01", "\x00\x00\x00"], "\x00\x00\x00", &[(1, 0, 3),] ), t!( regression060, &["\x00\x00\x00", "\x00\x00\x01"], "\x00\x00\x00", &[(0, 0, 3),] ), ]; const TEDDY: &'static [SearchTest] = &[ t!( teddy010, &["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"], "abcdefghijk", &[ (0, 0, 1), (1, 1, 2), (2, 2, 3), (3, 3, 4), (4, 4, 5), (5, 5, 6), (6, 6, 7), (7, 7, 8), (8, 8, 9), (9, 9, 10), (10, 10, 11) ] ), t!( teddy020, &["ab", "bc", "cd", "de", "ef", "fg", "gh", "hi", "ij", "jk", "kl"], "abcdefghijk", &[(0, 0, 2), (2, 2, 4), (4, 4, 6), (6, 6, 8), (8, 8, 10),] ), t!( teddy030, &["abc"], "abcdefghijklmnopqrstuvwxyzabcdefghijk", &[(0, 0, 3), (0, 26, 29)] ), ]; // Now define a test for each combination of things above that we want to run. // Since there are a few different combinations for each collection of tests, // we define a couple of macros to avoid repetition drudgery. The testconfig // macro constructs the automaton from a given match kind, and runs the search // tests one-by-one over the given collection. The `with` parameter allows one // to configure the config with additional parameters. The testcombo macro // invokes testconfig in precisely this way: it sets up several tests where // each one turns a different knob on Config. macro_rules! testconfig { ($name:ident, $collection:expr, $with:expr) => { #[test] fn $name() { run_search_tests($collection, |test| { let mut config = Config::new(); $with(&mut config); config .builder() .extend(test.patterns.iter().map(|p| p.as_bytes())) .build() .unwrap() .find_iter(&test.haystack) .collect() }); } }; } #[cfg(target_arch = "x86_64")] testconfig!( search_default_leftmost_first, PACKED_LEFTMOST_FIRST, |_: &mut Config| {} ); #[cfg(target_arch = "x86_64")] testconfig!( search_default_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.match_kind(MatchKind::LeftmostLongest); } ); #[cfg(target_arch = "x86_64")] testconfig!( search_teddy_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.force_teddy(true); } ); #[cfg(target_arch = "x86_64")] testconfig!( search_teddy_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); } ); #[cfg(target_arch = "x86_64")] testconfig!( search_teddy_ssse3_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.force_teddy(true); if is_x86_feature_detected!("ssse3") { c.force_avx(Some(false)); } } ); #[cfg(target_arch = "x86_64")] testconfig!( search_teddy_ssse3_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); if is_x86_feature_detected!("ssse3") { c.force_avx(Some(false)); } } ); #[cfg(target_arch = "x86_64")] testconfig!( search_teddy_avx2_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.force_teddy(true); if is_x86_feature_detected!("avx2") { c.force_avx(Some(true)); } } ); #[cfg(target_arch = "x86_64")] testconfig!( search_teddy_avx2_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); if is_x86_feature_detected!("avx2") { c.force_avx(Some(true)); } } ); #[cfg(target_arch = "x86_64")] 
testconfig!( search_teddy_fat_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.force_teddy(true); if is_x86_feature_detected!("avx2") { c.force_teddy_fat(Some(true)); } } ); #[cfg(target_arch = "x86_64")] testconfig!( search_teddy_fat_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.force_teddy(true).match_kind(MatchKind::LeftmostLongest); if is_x86_feature_detected!("avx2") { c.force_teddy_fat(Some(true)); } } ); testconfig!( search_rabinkarp_leftmost_first, PACKED_LEFTMOST_FIRST, |c: &mut Config| { c.force_rabin_karp(true); } ); testconfig!( search_rabinkarp_leftmost_longest, PACKED_LEFTMOST_LONGEST, |c: &mut Config| { c.force_rabin_karp(true).match_kind(MatchKind::LeftmostLongest); } ); #[test] fn search_tests_have_unique_names() { let assert = |constname, tests: &[SearchTest]| { let mut seen = HashMap::new(); // map from test name to position for (i, test) in tests.iter().enumerate() { if !seen.contains_key(test.name) { seen.insert(test.name, i); } else { let last = seen[test.name]; panic!( "{} tests have duplicate names at positions {} and {}", constname, last, i ); } } }; assert("BASICS", BASICS); assert("LEFTMOST", LEFTMOST); assert("LEFTMOST_FIRST", LEFTMOST_FIRST); assert("LEFTMOST_LONGEST", LEFTMOST_LONGEST); assert("REGRESSION", REGRESSION); assert("TEDDY", TEDDY); } fn run_search_tests Vec>( which: TestCollection, mut f: F, ) { let get_match_triples = |matches: Vec| -> Vec<(usize, usize, usize)> { matches .into_iter() .map(|m| (m.pattern(), m.start(), m.end())) .collect() }; for &tests in which { for spec in tests { for test in spec.variations() { assert_eq!( test.matches, get_match_triples(f(&test)).as_slice(), "test: {}, patterns: {:?}, haystack: {:?}, offset: {:?}", test.name, test.patterns, test.haystack, test.offset, ); } } } } aho-corasick-0.7.8/src/packed/vector.rs010064400017500000144000000163021352131022200161650ustar0000000000000000// This file contains a set of fairly generic utility functions when working // with SIMD vectors. // // SAFETY: All of the routines below are unsafe to call because they assume // the necessary CPU target features in order to use particular vendor // intrinsics. Calling these routines when the underlying CPU does not support // the appropriate target features is NOT safe. Callers must ensure this // themselves. // // Note that it may not look like this safety invariant is being upheld when // these routines are called. Namely, the CPU feature check is typically pretty // far away from when these routines are used. Instead, we rely on the fact // that certain types serve as a guaranteed receipt that pertinent target // features are enabled. For example, the only way TeddySlim3Mask256 can be // constructed is if the AVX2 CPU feature is available. Thus, any code running // inside of TeddySlim3Mask256 can use any of the functions below without any // additional checks: its very existence *is* the check. use std::arch::x86_64::*; /// Shift `a` to the left by two bytes (removing its two most significant /// bytes), and concatenate it with the the two most significant bytes of `b`. #[target_feature(enable = "avx2")] pub unsafe fn alignr256_14(a: __m256i, b: __m256i) -> __m256i { // Credit goes to jneem for figuring this out: // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 // // TL;DR avx2's PALIGNR instruction is actually just two 128-bit PALIGNR // instructions, which is not what we want, so we need to do some extra // shuffling. 
// This permute gives us the low 16 bytes of a concatenated with the high // 16 bytes of b, in order of most significant to least significant. So // `v = a[15:0] b[31:16]`. let v = _mm256_permute2x128_si256(b, a, 0x21); // This effectively does this (where we deal in terms of byte-indexing // and byte-shifting, and use inclusive ranges): // // ret[15:0] := ((a[15:0] << 16) | v[15:0]) >> 14 // = ((a[15:0] << 16) | b[31:16]) >> 14 // ret[31:16] := ((a[31:16] << 16) | v[31:16]) >> 14 // = ((a[31:16] << 16) | a[15:0]) >> 14 // // Which therefore results in: // // ret[31:0] := a[29:16] a[15:14] a[13:0] b[31:30] // // The end result is that we've effectively done this: // // (a << 2) | (b >> 30) // // When `A` and `B` are strings---where the beginning of the string is in // the least significant bits---we effectively result in the following // semantic operation: // // (A >> 2) | (B << 30) // // The reversal being attributed to the fact that we are in little-endian. _mm256_alignr_epi8(a, v, 14) } /// Shift `a` to the left by one byte (removing its most significant byte), and /// concatenate it with the the most significant byte of `b`. #[target_feature(enable = "avx2")] pub unsafe fn alignr256_15(a: __m256i, b: __m256i) -> __m256i { // For explanation, see alignr256_14. let v = _mm256_permute2x128_si256(b, a, 0x21); _mm256_alignr_epi8(a, v, 15) } /// Unpack the given 128-bit vector into its 64-bit components. The first /// element of the array returned corresponds to the least significant 64-bit /// lane in `a`. #[target_feature(enable = "ssse3")] pub unsafe fn unpack64x128(a: __m128i) -> [u64; 2] { [ _mm_cvtsi128_si64(a) as u64, _mm_cvtsi128_si64(_mm_srli_si128(a, 8)) as u64, ] } /// Unpack the given 256-bit vector into its 64-bit components. The first /// element of the array returned corresponds to the least significant 64-bit /// lane in `a`. #[target_feature(enable = "avx2")] pub unsafe fn unpack64x256(a: __m256i) -> [u64; 4] { // Using transmute here is precisely equivalent, but actually slower. It's // not quite clear why. let lo = _mm256_extracti128_si256(a, 0); let hi = _mm256_extracti128_si256(a, 1); [ _mm_cvtsi128_si64(lo) as u64, _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64, _mm_cvtsi128_si64(hi) as u64, _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64, ] } /// Unpack the low 128-bits of `a` and `b`, and return them as 4 64-bit /// integers. /// /// More precisely, if a = a4 a3 a2 a1 and b = b4 b3 b2 b1, where each element /// is a 64-bit integer and a1/b1 correspond to the least significant 64 bits, /// then the return value is `b2 b1 a2 a1`. #[target_feature(enable = "avx2")] pub unsafe fn unpacklo64x256(a: __m256i, b: __m256i) -> [u64; 4] { let lo = _mm256_castsi256_si128(a); let hi = _mm256_castsi256_si128(b); [ _mm_cvtsi128_si64(lo) as u64, _mm_cvtsi128_si64(_mm_srli_si128(lo, 8)) as u64, _mm_cvtsi128_si64(hi) as u64, _mm_cvtsi128_si64(_mm_srli_si128(hi, 8)) as u64, ] } /// Returns true if and only if all bits in the given 128-bit vector are 0. #[target_feature(enable = "ssse3")] pub unsafe fn is_all_zeroes128(a: __m128i) -> bool { let cmp = _mm_cmpeq_epi8(a, zeroes128()); _mm_movemask_epi8(cmp) as u32 == 0xFFFF } /// Returns true if and only if all bits in the given 256-bit vector are 0. #[target_feature(enable = "avx2")] pub unsafe fn is_all_zeroes256(a: __m256i) -> bool { let cmp = _mm256_cmpeq_epi8(a, zeroes256()); _mm256_movemask_epi8(cmp) as u32 == 0xFFFFFFFF } /// Load a 128-bit vector from slice at the given position. The slice does /// not need to be unaligned. 
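/// (In other words, an unaligned starting position is fine; the load below
/// is performed with an unaligned load instruction.)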
/// /// Since this code assumes little-endian (there is no big-endian x86), the /// bytes starting in `slice[at..]` will be at the least significant bits of /// the returned vector. This is important for the surrounding code, since for /// example, shifting the resulting vector right is equivalent to logically /// shifting the bytes in `slice` left. #[target_feature(enable = "sse2")] pub unsafe fn loadu128(slice: &[u8], at: usize) -> __m128i { let ptr = slice.get_unchecked(at..).as_ptr(); _mm_loadu_si128(ptr as *const u8 as *const __m128i) } /// Load a 256-bit vector from slice at the given position. The slice does /// not need to be unaligned. /// /// Since this code assumes little-endian (there is no big-endian x86), the /// bytes starting in `slice[at..]` will be at the least significant bits of /// the returned vector. This is important for the surrounding code, since for /// example, shifting the resulting vector right is equivalent to logically /// shifting the bytes in `slice` left. #[target_feature(enable = "avx2")] pub unsafe fn loadu256(slice: &[u8], at: usize) -> __m256i { let ptr = slice.get_unchecked(at..).as_ptr(); _mm256_loadu_si256(ptr as *const u8 as *const __m256i) } /// Returns a 128-bit vector with all bits set to 0. #[target_feature(enable = "sse2")] pub unsafe fn zeroes128() -> __m128i { _mm_set1_epi8(0) } /// Returns a 256-bit vector with all bits set to 0. #[target_feature(enable = "avx2")] pub unsafe fn zeroes256() -> __m256i { _mm256_set1_epi8(0) } /// Returns a 128-bit vector with all bits set to 1. #[target_feature(enable = "sse2")] pub unsafe fn ones128() -> __m128i { _mm_set1_epi8(0xFF as u8 as i8) } /// Returns a 256-bit vector with all bits set to 1. #[target_feature(enable = "avx2")] pub unsafe fn ones256() -> __m256i { _mm256_set1_epi8(0xFF as u8 as i8) } aho-corasick-0.7.8/src/prefilter.rs010066400017500000144000000757031361064550300154600ustar0000000000000000use std::cmp; use std::fmt; use std::panic::{RefUnwindSafe, UnwindSafe}; use std::u8; use memchr::{memchr, memchr2, memchr3}; use ahocorasick::MatchKind; use packed; use Match; /// A candidate is the result of running a prefilter on a haystack at a /// particular position. The result is either no match, a confirmed match or /// a possible match. /// /// When no match is returned, the prefilter is guaranteeing that no possible /// match can be found in the haystack, and the caller may trust this. That is, /// all correct prefilters must never report false negatives. /// /// In some cases, a prefilter can confirm a match very quickly, in which case, /// the caller may use this to stop what it's doing and report the match. In /// this case, prefilter implementations must never report a false positive. /// In other cases, the prefilter can only report a potential match, in which /// case the callers must attempt to confirm the match. In this case, prefilter /// implementations are permitted to return false positives. #[derive(Clone, Debug)] pub enum Candidate { None, Match(Match), PossibleStartOfMatch(usize), } impl Candidate { /// Convert this candidate into an option. This is useful when callers /// do not distinguish between true positives and false positives (i.e., /// the caller must always confirm the match in order to update some other /// state). 
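///
/// For example (illustrative): `Candidate::None` maps to `None`, while both
/// `Candidate::Match(m)` (yielding `Some(m.start())`) and
/// `Candidate::PossibleStartOfMatch(s)` (yielding `Some(s)`) collapse to
/// just a starting position, discarding whether the match was already
/// confirmed.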
pub fn into_option(self) -> Option { match self { Candidate::None => None, Candidate::Match(ref m) => Some(m.start()), Candidate::PossibleStartOfMatch(start) => Some(start), } } } /// A prefilter describes the behavior of fast literal scanners for quickly /// skipping past bytes in the haystack that we know cannot possibly /// participate in a match. pub trait Prefilter: Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug { /// Returns the next possible match candidate. This may yield false /// positives, so callers must confirm a match starting at the position /// returned. This, however, must never produce false negatives. That is, /// this must, at minimum, return the starting position of the next match /// in the given haystack after or at the given position. fn next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate; /// A method for cloning a prefilter, to work-around the fact that Clone /// is not object-safe. fn clone_prefilter(&self) -> Box; /// Returns the approximate total amount of heap used by this prefilter, in /// units of bytes. fn heap_bytes(&self) -> usize; /// Returns true if and only if this prefilter never returns false /// positives. This is useful for completely avoiding the automaton /// when the prefilter can quickly confirm its own matches. /// /// By default, this returns true, which is conservative; it is always /// correct to return `true`. Returning `false` here and reporting a false /// positive will result in incorrect searches. fn reports_false_positives(&self) -> bool { true } } impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P { #[inline] fn next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate { (**self).next_candidate(state, haystack, at) } fn clone_prefilter(&self) -> Box { (**self).clone_prefilter() } fn heap_bytes(&self) -> usize { (**self).heap_bytes() } fn reports_false_positives(&self) -> bool { (**self).reports_false_positives() } } /// A convenience object for representing any type that implements Prefilter /// and is cloneable. #[derive(Debug)] pub struct PrefilterObj(Box); impl Clone for PrefilterObj { fn clone(&self) -> Self { PrefilterObj(self.0.clone_prefilter()) } } impl PrefilterObj { /// Create a new prefilter object. pub fn new(t: T) -> PrefilterObj { PrefilterObj(Box::new(t)) } /// Return the underlying prefilter trait object. pub fn as_ref(&self) -> &dyn Prefilter { &*self.0 } } /// PrefilterState tracks state associated with the effectiveness of a /// prefilter. It is used to track how many bytes, on average, are skipped by /// the prefilter. If this average dips below a certain threshold over time, /// then the state renders the prefilter inert and stops using it. /// /// A prefilter state should be created for each search. (Where creating an /// iterator via, e.g., `find_iter`, is treated as a single search.) #[derive(Clone, Debug)] pub struct PrefilterState { /// The number of skips that has been executed. skips: usize, /// The total number of bytes that have been skipped. skipped: usize, /// The maximum length of a match. This is used to help determine how many /// bytes on average should be skipped in order for a prefilter to be /// effective. max_match_len: usize, /// Once this heuristic has been deemed permanently ineffective, it will be /// inert throughout the rest of its lifetime. This serves as a cheap way /// to check inertness. inert: bool, /// The last (absolute) position at which a prefilter scanned to. 
/// Prefilters can use this position to determine whether to re-scan or /// not. /// /// Unlike other things that impact effectiveness, this is a fleeting /// condition. That is, a prefilter can be considered ineffective if it is /// at a position before `last_scan_at`, but can become effective again /// once the search moves past `last_scan_at`. /// /// The utility of this is to both avoid additional overhead from calling /// the prefilter and to avoid quadratic behavior. This ensures that a /// prefilter will scan any particular byte at most once. (Note that some /// prefilters, like the start-byte prefilter, do not need to use this /// field at all, since it only looks for starting bytes.) last_scan_at: usize, } impl PrefilterState { /// The minimum number of skip attempts to try before considering whether /// a prefilter is effective or not. const MIN_SKIPS: usize = 40; /// The minimum amount of bytes that skipping must average, expressed as a /// factor of the multiple of the length of a possible match. /// /// That is, after MIN_SKIPS have occurred, if the average number of bytes /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the /// prefilter outed to be rendered inert. const MIN_AVG_FACTOR: usize = 2; /// Create a fresh prefilter state. pub fn new(max_match_len: usize) -> PrefilterState { PrefilterState { skips: 0, skipped: 0, max_match_len, inert: false, last_scan_at: 0, } } /// Update this state with the number of bytes skipped on the last /// invocation of the prefilter. #[inline] fn update_skipped_bytes(&mut self, skipped: usize) { self.skips += 1; self.skipped += skipped; } /// Updates the position at which the last scan stopped. This may be /// greater than the position of the last candidate reported. For example, /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz` /// will report a candidate at position `0`, but the end of its last scan /// will be at position `3`. /// /// This position factors into the effectiveness of this prefilter. If the /// current position is less than the last position at which a scan ended, /// then the prefilter should not be re-run until the search moves past /// that position. #[inline] fn update_at(&mut self, at: usize) { if at > self.last_scan_at { self.last_scan_at = at; } } /// Return true if and only if this state indicates that a prefilter is /// still effective. /// /// The given pos should correspond to the current starting position of the /// search. #[inline] pub fn is_effective(&mut self, at: usize) -> bool { if self.inert { return false; } if at < self.last_scan_at { return false; } if self.skips < PrefilterState::MIN_SKIPS { return true; } let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len; if self.skipped >= min_avg * self.skips { return true; } // We're inert. self.inert = true; false } } /// A builder for constructing the best possible prefilter. When constructed, /// this builder will heuristically select the best prefilter it can build, /// if any, and discard the rest. #[derive(Debug)] pub struct Builder { count: usize, ascii_case_insensitive: bool, start_bytes: StartBytesBuilder, rare_bytes: RareBytesBuilder, packed: Option, } impl Builder { /// Create a new builder for constructing the best possible prefilter. 
pub fn new(kind: MatchKind) -> Builder { let pbuilder = kind .as_packed() .map(|kind| packed::Config::new().match_kind(kind).builder()); Builder { count: 0, ascii_case_insensitive: false, start_bytes: StartBytesBuilder::new(), rare_bytes: RareBytesBuilder::new(), packed: pbuilder, } } /// Enable ASCII case insensitivity. When set, byte strings added to this /// builder will be interpreted without respect to ASCII case. pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder { self.ascii_case_insensitive = yes; self.start_bytes = self.start_bytes.ascii_case_insensitive(yes); self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes); self } /// Return a prefilter suitable for quickly finding potential matches. /// /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. pub fn build(&self) -> Option { match (self.start_bytes.build(), self.rare_bytes.build()) { // If we could build both start and rare prefilters, then there are // a few cases in which we'd want to use the start-byte prefilter // over the rare-byte prefilter, since the former has lower // overhead. (prestart @ Some(_), prerare @ Some(_)) => { // If the start-byte prefilter can scan for a smaller number // of bytes than the rare-byte prefilter, then it's probably // faster. let has_fewer_bytes = self.start_bytes.count < self.rare_bytes.count; // Otherwise, if the combined frequency rank of the detected // bytes in the start-byte prefilter is "close" to the combined // frequency rank of the rare-byte prefilter, then we pick // the start-byte prefilter even if the rare-byte prefilter // heuristically searches for rare bytes. This is because the // rare-byte prefilter has higher constant costs, so we tend to // prefer the start-byte prefilter when we can. let has_rarer_bytes = self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50; if has_fewer_bytes || has_rarer_bytes { prestart } else { prerare } } (prestart @ Some(_), None) => prestart, (None, prerare @ Some(_)) => prerare, (None, None) if self.ascii_case_insensitive => None, (None, None) => self .packed .as_ref() .and_then(|b| b.build()) .map(|s| PrefilterObj::new(Packed(s))), } } /// Add a literal string to this prefilter builder. pub fn add(&mut self, bytes: &[u8]) { self.count += 1; self.start_bytes.add(bytes); self.rare_bytes.add(bytes); if let Some(ref mut pbuilder) = self.packed { pbuilder.add(bytes); } } } /// A type that wraps a packed searcher and implements the `Prefilter` /// interface. #[derive(Clone, Debug)] struct Packed(packed::Searcher); impl Prefilter for Packed { fn next_candidate( &self, _state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate { self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match) } fn clone_prefilter(&self) -> Box { Box::new(self.clone()) } fn heap_bytes(&self) -> usize { self.0.heap_bytes() } fn reports_false_positives(&self) -> bool { false } } /// A builder for constructing a rare byte prefilter. /// /// A rare byte prefilter attempts to pick out a small set of rare bytes that /// occurr in the patterns, and then quickly scan to matches of those rare /// bytes. #[derive(Clone, Debug)] struct RareBytesBuilder { /// Whether this prefilter should account for ASCII case insensitivity or /// not. ascii_case_insensitive: bool, /// A set of byte offsets associated with detected rare bytes. An entry is /// only set if a rare byte is detected in a pattern. 
byte_offsets: RareByteOffsets, /// Whether this is available as a prefilter or not. This can be set to /// false during construction if a condition is seen that invalidates the /// use of the rare-byte prefilter. available: bool, /// The number of bytes set to an active value in `byte_offsets`. count: usize, /// The sum of frequency ranks for the rare bytes detected. This is /// intended to give a heuristic notion of how rare the bytes are. rank_sum: u16, } /// A set of rare byte offsets, keyed by byte. #[derive(Clone, Copy)] struct RareByteOffsets { /// When an item in this set has an offset of u8::MAX (255), then it is /// considered unset. set: [RareByteOffset; 256], } impl RareByteOffsets { /// Create a new empty set of rare byte offsets. pub fn empty() -> RareByteOffsets { RareByteOffsets { set: [RareByteOffset::default(); 256] } } /// Add the given offset for the given byte to this set. If the offset is /// greater than the existing offset, then it overwrites the previous /// value and returns false. If there is no previous value set, then this /// sets it and returns true. /// /// The given offset must be active, otherwise this panics. pub fn apply(&mut self, byte: u8, off: RareByteOffset) -> bool { assert!(off.is_active()); let existing = &mut self.set[byte as usize]; if !existing.is_active() { *existing = off; true } else { if existing.max < off.max { *existing = off; } false } } } impl fmt::Debug for RareByteOffsets { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut offsets = vec![]; for off in self.set.iter() { if off.is_active() { offsets.push(off); } } f.debug_struct("RareByteOffsets").field("set", &offsets).finish() } } /// Offsets associated with an occurrence of a "rare" byte in any of the /// patterns used to construct a single Aho-Corasick automaton. #[derive(Clone, Copy, Debug)] struct RareByteOffset { /// The maximum offset at which a particular byte occurs from the start /// of any pattern. This is used as a shift amount. That is, when an /// occurrence of this byte is found, the candidate position reported by /// the prefilter is `position_of_byte - max`, such that the automaton /// will begin its search at a position that is guaranteed to observe a /// match. /// /// To avoid accidentally quadratic behavior, a prefilter is considered /// ineffective when it is asked to start scanning from a position that it /// has already scanned past. /// /// N.B. The maximum value for this is 254. A value of 255 indicates that /// this is unused. If a rare byte is found at an offset of 255 or greater, /// then the rare-byte prefilter is disabled for simplicity. max: u8, } impl Default for RareByteOffset { fn default() -> RareByteOffset { RareByteOffset { max: u8::MAX } } } impl RareByteOffset { /// Create a new rare byte offset. If the given offset is too big, then /// an inactive `RareByteOffset` is returned. fn new(max: usize) -> RareByteOffset { if max > (u8::MAX - 1) as usize { RareByteOffset::default() } else { RareByteOffset { max: max as u8 } } } /// Returns true if and only if this offset is active. If it's inactive, /// then it should not be used. fn is_active(&self) -> bool { self.max < u8::MAX } } impl RareBytesBuilder { /// Create a new builder for constructing a rare byte prefilter. fn new() -> RareBytesBuilder { RareBytesBuilder { ascii_case_insensitive: false, byte_offsets: RareByteOffsets::empty(), available: true, count: 0, rank_sum: 0, } } /// Enable ASCII case insensitivity. 
When set, byte strings added to this /// builder will be interpreted without respect to ASCII case. fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder { self.ascii_case_insensitive = yes; self } /// Build the rare bytes prefilter. /// /// If there are more than 3 distinct starting bytes, or if heuristics /// otherwise determine that this prefilter should not be used, then `None` /// is returned. fn build(&self) -> Option { if !self.available || self.count > 3 { return None; } let (mut bytes, mut len) = ([0; 3], 0); for b in 0..256 { if self.byte_offsets.set[b].is_active() { bytes[len] = b as u8; len += 1; } } match len { 0 => None, 1 => Some(PrefilterObj::new(RareBytesOne { byte1: bytes[0], offset: self.byte_offsets.set[bytes[0] as usize], })), 2 => Some(PrefilterObj::new(RareBytesTwo { offsets: self.byte_offsets, byte1: bytes[0], byte2: bytes[1], })), 3 => Some(PrefilterObj::new(RareBytesThree { offsets: self.byte_offsets, byte1: bytes[0], byte2: bytes[1], byte3: bytes[2], })), _ => unreachable!(), } } /// Add a byte string to this builder. /// /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. fn add(&mut self, bytes: &[u8]) { // If we've already blown our budget, then don't waste time looking // for more rare bytes. if self.count > 3 { self.available = false; return; } let mut rarest = match bytes.get(0) { None => return, Some(&b) => (b, 0, freq_rank(b)), }; // The idea here is to look for the rarest byte in each pattern, and // add that to our set. As a special exception, if we see a byte that // we've already added, then we immediately stop and choose that byte, // even if there's another rare byte in the pattern. This helps us // apply the rare byte optimization in more cases by attempting to pick // bytes that are in common between patterns. So for example, if we // were searching for `Sherlock` and `lockjaw`, then this would pick // `k` for both patterns, resulting in the use of `memchr` instead of // `memchr2` for `k` and `j`. for (pos, &b) in bytes.iter().enumerate() { if self.byte_offsets.set[b as usize].is_active() { self.add_rare_byte(b, pos); return; } let rank = freq_rank(b); if rank < rarest.2 { rarest = (b, pos, rank); } } self.add_rare_byte(rarest.0, rarest.1); } fn add_rare_byte(&mut self, byte: u8, pos: usize) { self.add_one_byte(byte, pos); if self.ascii_case_insensitive { self.add_one_byte(opposite_ascii_case(byte), pos); } } fn add_one_byte(&mut self, byte: u8, pos: usize) { let off = RareByteOffset::new(pos); if !off.is_active() { self.available = false; return; } if self.byte_offsets.apply(byte, off) { self.count += 1; self.rank_sum += freq_rank(byte) as u16; } } } /// A prefilter for scanning for a single "rare" byte. #[derive(Clone, Debug)] struct RareBytesOne { byte1: u8, offset: RareByteOffset, } impl Prefilter for RareBytesOne { fn next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate { memchr(self.byte1, &haystack[at..]) .map(|i| { let pos = at + i; state.last_scan_at = pos; cmp::max(at, pos.saturating_sub(self.offset.max as usize)) }) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } fn clone_prefilter(&self) -> Box { Box::new(self.clone()) } fn heap_bytes(&self) -> usize { 0 } } /// A prefilter for scanning for two "rare" bytes. 
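///
/// When either byte is found at position `pos`, the reported candidate is
/// `pos` pulled back by that byte's maximum offset within any pattern, and
/// clamped so it never precedes the current search start. A small sketch of
/// that arithmetic, reusing the `abcz`/`abczdef` example from
/// `PrefilterState::update_at` (the concrete values are illustrative):
///
/// ```ignore
/// // `z` sits at offset 3 in the pattern `abcz`, so offsets.set[b'z'].max == 3.
/// // Scanning `abczdef` from `at == 0` finds `z` at position 3, so:
/// let candidate = cmp::max(at, pos.saturating_sub(3)); // == 0
/// ```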
#[derive(Clone, Debug)] struct RareBytesTwo { offsets: RareByteOffsets, byte1: u8, byte2: u8, } impl Prefilter for RareBytesTwo { fn next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate { memchr2(self.byte1, self.byte2, &haystack[at..]) .map(|i| { let pos = at + i; state.update_at(pos); let offset = self.offsets.set[haystack[pos] as usize].max; cmp::max(at, pos.saturating_sub(offset as usize)) }) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } fn clone_prefilter(&self) -> Box { Box::new(self.clone()) } fn heap_bytes(&self) -> usize { 0 } } /// A prefilter for scanning for three "rare" bytes. #[derive(Clone, Debug)] struct RareBytesThree { offsets: RareByteOffsets, byte1: u8, byte2: u8, byte3: u8, } impl Prefilter for RareBytesThree { fn next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate { memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) .map(|i| { let pos = at + i; state.update_at(pos); let offset = self.offsets.set[haystack[pos] as usize].max; cmp::max(at, pos.saturating_sub(offset as usize)) }) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } fn clone_prefilter(&self) -> Box { Box::new(self.clone()) } fn heap_bytes(&self) -> usize { 0 } } /// A builder for constructing a starting byte prefilter. /// /// A starting byte prefilter is a simplistic prefilter that looks for possible /// matches by reporting all positions corresponding to a particular byte. This /// generally only takes affect when there are at most 3 distinct possible /// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two /// distinct starting bytes (`f` and `b`), and this prefilter returns all /// occurrences of either `f` or `b`. /// /// In some cases, a heuristic frequency analysis may determine that it would /// be better not to use this prefilter even when there are 3 or fewer distinct /// starting bytes. #[derive(Clone, Debug)] struct StartBytesBuilder { /// Whether this prefilter should account for ASCII case insensitivity or /// not. ascii_case_insensitive: bool, /// The set of starting bytes observed. byteset: Vec, /// The number of bytes set to true in `byteset`. count: usize, /// The sum of frequency ranks for the rare bytes detected. This is /// intended to give a heuristic notion of how rare the bytes are. rank_sum: u16, } impl StartBytesBuilder { /// Create a new builder for constructing a start byte prefilter. fn new() -> StartBytesBuilder { StartBytesBuilder { ascii_case_insensitive: false, byteset: vec![false; 256], count: 0, rank_sum: 0, } } /// Enable ASCII case insensitivity. When set, byte strings added to this /// builder will be interpreted without respect to ASCII case. fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder { self.ascii_case_insensitive = yes; self } /// Build the starting bytes prefilter. /// /// If there are more than 3 distinct starting bytes, or if heuristics /// otherwise determine that this prefilter should not be used, then `None` /// is returned. fn build(&self) -> Option { if self.count > 3 { return None; } let (mut bytes, mut len) = ([0; 3], 0); for b in 0..256 { if !self.byteset[b] { continue; } // We don't handle non-ASCII bytes for now. Getting non-ASCII // bytes right is trickier, since we generally don't want to put // a leading UTF-8 code unit into a prefilter that isn't ASCII, // since they can frequently. 
Instead, it would be better to use a // continuation byte, but this requires more sophisticated analysis // of the automaton and a richer prefilter API. if b > 0x7F { return None; } bytes[len] = b as u8; len += 1; } match len { 0 => None, 1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })), 2 => Some(PrefilterObj::new(StartBytesTwo { byte1: bytes[0], byte2: bytes[1], })), 3 => Some(PrefilterObj::new(StartBytesThree { byte1: bytes[0], byte2: bytes[1], byte3: bytes[2], })), _ => unreachable!(), } } /// Add a byte string to this builder. /// /// All patterns added to an Aho-Corasick automaton should be added to this /// builder before attempting to construct the prefilter. fn add(&mut self, bytes: &[u8]) { if self.count > 3 { return; } if let Some(&byte) = bytes.get(0) { self.add_one_byte(byte); if self.ascii_case_insensitive { self.add_one_byte(opposite_ascii_case(byte)); } } } fn add_one_byte(&mut self, byte: u8) { if !self.byteset[byte as usize] { self.byteset[byte as usize] = true; self.count += 1; self.rank_sum += freq_rank(byte) as u16; } } } /// A prefilter for scanning for a single starting byte. #[derive(Clone, Debug)] struct StartBytesOne { byte1: u8, } impl Prefilter for StartBytesOne { fn next_candidate( &self, _state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate { memchr(self.byte1, &haystack[at..]) .map(|i| at + i) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } fn clone_prefilter(&self) -> Box { Box::new(self.clone()) } fn heap_bytes(&self) -> usize { 0 } } /// A prefilter for scanning for two starting bytes. #[derive(Clone, Debug)] struct StartBytesTwo { byte1: u8, byte2: u8, } impl Prefilter for StartBytesTwo { fn next_candidate( &self, _state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate { memchr2(self.byte1, self.byte2, &haystack[at..]) .map(|i| at + i) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } fn clone_prefilter(&self) -> Box { Box::new(self.clone()) } fn heap_bytes(&self) -> usize { 0 } } /// A prefilter for scanning for three starting bytes. #[derive(Clone, Debug)] struct StartBytesThree { byte1: u8, byte2: u8, byte3: u8, } impl Prefilter for StartBytesThree { fn next_candidate( &self, _state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate { memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..]) .map(|i| at + i) .map_or(Candidate::None, Candidate::PossibleStartOfMatch) } fn clone_prefilter(&self) -> Box { Box::new(self.clone()) } fn heap_bytes(&self) -> usize { 0 } } /// Return the next candidate reported by the given prefilter while /// simultaneously updating the given prestate. /// /// The caller is responsible for checking the prestate before deciding whether /// to initiate a search. #[inline] pub fn next( prestate: &mut PrefilterState, prefilter: P, haystack: &[u8], at: usize, ) -> Candidate { let cand = prefilter.next_candidate(prestate, haystack, at); match cand { Candidate::None => { prestate.update_skipped_bytes(haystack.len() - at); } Candidate::Match(ref m) => { prestate.update_skipped_bytes(m.start() - at); } Candidate::PossibleStartOfMatch(i) => { prestate.update_skipped_bytes(i - at); } } cand } /// If the given byte is an ASCII letter, then return it in the opposite case. /// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns /// `b'A'`. If a non-ASCII letter is given, then the given byte is returned. 
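///
/// A few concrete cases, restating the behavior described above (shown as an
/// illustrative sketch rather than a doctest, since this helper is
/// crate-internal):
///
/// ```ignore
/// assert_eq!(opposite_ascii_case(b'A'), b'a');
/// assert_eq!(opposite_ascii_case(b'a'), b'A');
/// assert_eq!(opposite_ascii_case(b'7'), b'7'); // non-letters pass through unchanged
/// ```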
pub fn opposite_ascii_case(b: u8) -> u8 { if b'A' <= b && b <= b'Z' { b.to_ascii_lowercase() } else if b'a' <= b && b <= b'z' { b.to_ascii_uppercase() } else { b } } /// Return the frequency rank of the given byte. The higher the rank, the more /// common the byte (heuristically speaking). fn freq_rank(b: u8) -> u8 { use byte_frequencies::BYTE_FREQUENCIES; BYTE_FREQUENCIES[b as usize] } #[cfg(test)] mod tests { use super::*; #[test] fn scratch() { let mut b = Builder::new(MatchKind::LeftmostFirst); b.add(b"Sherlock"); b.add(b"locjaw"); // b.add(b"Sherlock"); // b.add(b"Holmes"); // b.add(b"Watson"); // b.add("Шерлок Холмс".as_bytes()); // b.add("Джон Уотсон".as_bytes()); let s = b.build().unwrap(); println!("{:?}", s); } } aho-corasick-0.7.8/src/state_id.rs010066400017500001731000000117711361627453200152640ustar0000000000000000use std::fmt::Debug; use std::hash::Hash; use error::{Error, Result}; // NOTE: Most of this code was copied from regex-automata, but without the // (de)serialization specific stuff. /// Check that the premultiplication of the given state identifier can /// fit into the representation indicated by `S`. If it cannot, or if it /// overflows `usize` itself, then an error is returned. pub fn premultiply_overflow_error( last_state: S, alphabet_len: usize, ) -> Result<()> { let requested = match last_state.to_usize().checked_mul(alphabet_len) { Some(requested) => requested, None => return Err(Error::premultiply_overflow(0, 0)), }; if requested > S::max_id() { return Err(Error::premultiply_overflow(S::max_id(), requested)); } Ok(()) } /// Convert the given `usize` to the chosen state identifier /// representation. If the given value cannot fit in the chosen /// representation, then an error is returned. pub fn usize_to_state_id(value: usize) -> Result { if value > S::max_id() { Err(Error::state_id_overflow(S::max_id())) } else { Ok(S::from_usize(value)) } } /// Return the unique identifier for an automaton's fail state in the chosen /// representation indicated by `S`. pub fn fail_id() -> S { S::from_usize(0) } /// Return the unique identifier for an automaton's fail state in the chosen /// representation indicated by `S`. pub fn dead_id() -> S { S::from_usize(1) } mod private { /// Sealed stops crates other than aho-corasick from implementing any /// traits that use it. pub trait Sealed {} impl Sealed for u8 {} impl Sealed for u16 {} impl Sealed for u32 {} impl Sealed for u64 {} impl Sealed for usize {} } /// A trait describing the representation of an automaton's state identifier. /// /// The purpose of this trait is to safely express both the possible state /// identifier representations that can be used in an automaton and to convert /// between state identifier representations and types that can be used to /// efficiently index memory (such as `usize`). /// /// In general, one should not need to implement this trait explicitly. Indeed, /// for now, this trait is sealed such that it cannot be implemented by any /// other type. In particular, this crate provides implementations for `u8`, /// `u16`, `u32`, `u64` and `usize`. (`u32` and `u64` are only provided for /// targets that can represent all corresponding values in a `usize`.) pub trait StateID: private::Sealed + Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord { /// Convert from a `usize` to this implementation's representation. /// /// Implementors may assume that `n <= Self::max_id`. That is, implementors /// do not need to check whether `n` can fit inside this implementation's /// representation. 
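///
/// As an illustrative sketch (not a doctest), an implementation for a
/// fixed-width integer type is expected to round-trip any value up to its
/// `max_id`:
///
/// ```ignore
/// let id = <u16 as StateID>::from_usize(42);
/// assert_eq!(id.to_usize(), 42);
/// assert!(42 <= <u16 as StateID>::max_id());
/// ```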
fn from_usize(n: usize) -> Self; /// Convert this implementation's representation to a `usize`. /// /// Implementors must not return a `usize` value greater than /// `Self::max_id` and must not permit overflow when converting between the /// implementor's representation and `usize`. In general, the preferred /// way for implementors to achieve this is to simply not provide /// implementations of `StateID` that cannot fit into the target platform's /// `usize`. fn to_usize(self) -> usize; /// Return the maximum state identifier supported by this representation. /// /// Implementors must return a correct bound. Doing otherwise may result /// in unspecified behavior (but will not violate memory safety). fn max_id() -> usize; } impl StateID for usize { #[inline] fn from_usize(n: usize) -> usize { n } #[inline] fn to_usize(self) -> usize { self } #[inline] fn max_id() -> usize { ::std::usize::MAX } } impl StateID for u8 { #[inline] fn from_usize(n: usize) -> u8 { n as u8 } #[inline] fn to_usize(self) -> usize { self as usize } #[inline] fn max_id() -> usize { ::std::u8::MAX as usize } } impl StateID for u16 { #[inline] fn from_usize(n: usize) -> u16 { n as u16 } #[inline] fn to_usize(self) -> usize { self as usize } #[inline] fn max_id() -> usize { ::std::u16::MAX as usize } } #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] impl StateID for u32 { #[inline] fn from_usize(n: usize) -> u32 { n as u32 } #[inline] fn to_usize(self) -> usize { self as usize } #[inline] fn max_id() -> usize { ::std::u32::MAX as usize } } #[cfg(target_pointer_width = "64")] impl StateID for u64 { #[inline] fn from_usize(n: usize) -> u64 { n as u64 } #[inline] fn to_usize(self) -> usize { self as usize } #[inline] fn max_id() -> usize { ::std::u64::MAX as usize } } aho-corasick-0.7.8/src/tests.rs010066400017500001731000001117741361627453200146360ustar0000000000000000use std::collections::HashMap; use std::io; use std::usize; use {AhoCorasickBuilder, Match, MatchKind}; /// A description of a single test against an Aho-Corasick automaton. /// /// A single test may not necessarily pass on every configuration of an /// Aho-Corasick automaton. The tests are categorized and grouped appropriately /// below. #[derive(Clone, Debug, Eq, PartialEq)] struct SearchTest { /// The name of this test, for debugging. name: &'static str, /// The patterns to search for. patterns: &'static [&'static str], /// The text to search. haystack: &'static str, /// Each match is a triple of (pattern_index, start, end), where /// pattern_index is an index into `patterns` and `start`/`end` are indices /// into `haystack`. matches: &'static [(usize, usize, usize)], } /// Short-hand constructor for SearchTest. We use it a lot below. macro_rules! t { ($name:ident, $patterns:expr, $haystack:expr, $matches:expr) => { SearchTest { name: stringify!($name), patterns: $patterns, haystack: $haystack, matches: $matches, } }; } /// A collection of test groups. type TestCollection = &'static [&'static [SearchTest]]; // Define several collections corresponding to the different type of match // semantics supported by Aho-Corasick. These collections have some overlap, // but each collection should have some tests that no other collection has. /// Tests for Aho-Corasick's standard non-overlapping match semantics. const AC_STANDARD_NON_OVERLAPPING: TestCollection = &[BASICS, NON_OVERLAPPING, STANDARD, REGRESSION]; /// Tests for Aho-Corasick's anchored standard non-overlapping match semantics. 
const AC_STANDARD_ANCHORED_NON_OVERLAPPING: TestCollection = &[ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, STANDARD_ANCHORED]; /// Tests for Aho-Corasick's standard overlapping match semantics. const AC_STANDARD_OVERLAPPING: TestCollection = &[BASICS, OVERLAPPING, REGRESSION]; /// Tests for Aho-Corasick's anchored standard overlapping match semantics. const AC_STANDARD_ANCHORED_OVERLAPPING: TestCollection = &[ANCHORED_BASICS, ANCHORED_OVERLAPPING]; /// Tests for Aho-Corasick's leftmost-first match semantics. const AC_LEFTMOST_FIRST: TestCollection = &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_FIRST, REGRESSION]; /// Tests for Aho-Corasick's anchored leftmost-first match semantics. const AC_LEFTMOST_FIRST_ANCHORED: TestCollection = &[ ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, ANCHORED_LEFTMOST, ANCHORED_LEFTMOST_FIRST, ]; /// Tests for Aho-Corasick's leftmost-longest match semantics. const AC_LEFTMOST_LONGEST: TestCollection = &[BASICS, NON_OVERLAPPING, LEFTMOST, LEFTMOST_LONGEST, REGRESSION]; /// Tests for Aho-Corasick's anchored leftmost-longest match semantics. const AC_LEFTMOST_LONGEST_ANCHORED: TestCollection = &[ ANCHORED_BASICS, ANCHORED_NON_OVERLAPPING, ANCHORED_LEFTMOST, ANCHORED_LEFTMOST_LONGEST, ]; // Now define the individual tests that make up the collections above. /// A collection of tests for the Aho-Corasick algorithm that should always be /// true regardless of match semantics. That is, all combinations of /// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} /// should produce the same answer. const BASICS: &'static [SearchTest] = &[ t!(basic000, &[], "", &[]), t!(basic001, &["a"], "", &[]), t!(basic010, &["a"], "a", &[(0, 0, 1)]), t!(basic020, &["a"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(basic030, &["a"], "aaa", &[(0, 0, 1), (0, 1, 2), (0, 2, 3)]), t!(basic040, &["a"], "aba", &[(0, 0, 1), (0, 2, 3)]), t!(basic050, &["a"], "bba", &[(0, 2, 3)]), t!(basic060, &["a"], "bbb", &[]), t!(basic070, &["a"], "bababbbba", &[(0, 1, 2), (0, 3, 4), (0, 8, 9)]), t!(basic100, &["aa"], "", &[]), t!(basic110, &["aa"], "aa", &[(0, 0, 2)]), t!(basic120, &["aa"], "aabbaa", &[(0, 0, 2), (0, 4, 6)]), t!(basic130, &["aa"], "abbab", &[]), t!(basic140, &["aa"], "abbabaa", &[(0, 5, 7)]), t!(basic200, &["abc"], "abc", &[(0, 0, 3)]), t!(basic210, &["abc"], "zazabzabcz", &[(0, 6, 9)]), t!(basic220, &["abc"], "zazabczabcz", &[(0, 3, 6), (0, 7, 10)]), t!(basic300, &["a", "b"], "", &[]), t!(basic310, &["a", "b"], "z", &[]), t!(basic320, &["a", "b"], "b", &[(1, 0, 1)]), t!(basic330, &["a", "b"], "a", &[(0, 0, 1)]), t!( basic340, &["a", "b"], "abba", &[(0, 0, 1), (1, 1, 2), (1, 2, 3), (0, 3, 4),] ), t!( basic350, &["b", "a"], "abba", &[(1, 0, 1), (0, 1, 2), (0, 2, 3), (1, 3, 4),] ), t!(basic360, &["abc", "bc"], "xbc", &[(1, 1, 3),]), t!(basic400, &["foo", "bar"], "", &[]), t!(basic410, &["foo", "bar"], "foobar", &[(0, 0, 3), (1, 3, 6),]), t!(basic420, &["foo", "bar"], "barfoo", &[(1, 0, 3), (0, 3, 6),]), t!(basic430, &["foo", "bar"], "foofoo", &[(0, 0, 3), (0, 3, 6),]), t!(basic440, &["foo", "bar"], "barbar", &[(1, 0, 3), (1, 3, 6),]), t!(basic450, &["foo", "bar"], "bafofoo", &[(0, 4, 7),]), t!(basic460, &["bar", "foo"], "bafofoo", &[(1, 4, 7),]), t!(basic470, &["foo", "bar"], "fobabar", &[(1, 4, 7),]), t!(basic480, &["bar", "foo"], "fobabar", &[(0, 4, 7),]), t!(basic600, &[""], "", &[(0, 0, 0)]), t!(basic610, &[""], "a", &[(0, 0, 0), (0, 1, 1)]), t!(basic620, &[""], "abc", &[(0, 0, 0), (0, 1, 1), (0, 2, 2), (0, 3, 3)]), t!(basic700, &["yabcdef", "abcdezghi"], "yabcdefghi", &[(0, 0, 
7),]), t!(basic710, &["yabcdef", "abcdezghi"], "yabcdezghi", &[(1, 1, 10),]), t!( basic720, &["yabcdef", "bcdeyabc", "abcdezghi"], "yabcdezghi", &[(2, 1, 10),] ), ]; /// A collection of *anchored* tests for the Aho-Corasick algorithm that should /// always be true regardless of match semantics. That is, all combinations of /// leftmost-{shortest, first, longest} x {overlapping, non-overlapping} should /// produce the same answer. const ANCHORED_BASICS: &'static [SearchTest] = &[ t!(abasic000, &[], "", &[]), t!(abasic010, &[""], "", &[(0, 0, 0)]), t!(abasic020, &[""], "a", &[(0, 0, 0)]), t!(abasic030, &[""], "abc", &[(0, 0, 0)]), t!(abasic100, &["a"], "a", &[(0, 0, 1)]), t!(abasic110, &["a"], "aa", &[(0, 0, 1)]), t!(abasic120, &["a", "b"], "ab", &[(0, 0, 1)]), t!(abasic130, &["a", "b"], "ba", &[(1, 0, 1)]), t!(abasic140, &["foo", "foofoo"], "foo", &[(0, 0, 3)]), t!(abasic150, &["foofoo", "foo"], "foo", &[(1, 0, 3)]), ]; /// Tests for non-overlapping standard match semantics. /// /// These tests generally shouldn't pass for leftmost-{first,longest}, although /// some do in order to write clearer tests. For example, standard000 will /// pass with leftmost-first semantics, but standard010 will not. We write /// both to emphasize how the match semantics work. const STANDARD: &'static [SearchTest] = &[ t!(standard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(standard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), t!(standard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), t!(standard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), t!(standard040, &["a", ""], "a", &[(1, 0, 0), (1, 1, 1)]), t!( standard400, &["abcd", "bcd", "cd", "b"], "abcd", &[(3, 1, 2), (2, 2, 4),] ), t!(standard410, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), t!(standard420, &["", "a"], "aa", &[(0, 0, 0), (0, 1, 1), (0, 2, 2),]), t!(standard430, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), t!(standard440, &["a", "", ""], "a", &[(1, 0, 0), (1, 1, 1),]), t!(standard450, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), ]; /// Like STANDARD, but for anchored searches. const STANDARD_ANCHORED: &'static [SearchTest] = &[ t!(astandard000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(astandard010, &["abcd", "ab"], "abcd", &[(1, 0, 2)]), t!(astandard020, &["abcd", "ab", "abc"], "abcd", &[(1, 0, 2)]), t!(astandard030, &["abcd", "abc", "ab"], "abcd", &[(2, 0, 2)]), t!(astandard040, &["a", ""], "a", &[(1, 0, 0)]), t!(astandard050, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), t!(astandard410, &["", "a"], "a", &[(0, 0, 0)]), t!(astandard420, &["", "a"], "aa", &[(0, 0, 0)]), t!(astandard430, &["", "a", ""], "a", &[(0, 0, 0)]), t!(astandard440, &["a", "", ""], "a", &[(1, 0, 0)]), t!(astandard450, &["", "", "a"], "a", &[(0, 0, 0)]), ]; /// Tests for non-overlapping leftmost match semantics. These should pass for /// both leftmost-first and leftmost-longest match kinds. Stated differently, /// among ambiguous matches, the longest match and the match that appeared /// first when constructing the automaton should always be the same. 
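///
/// For instance, `leftmost300` below expects the same result under either
/// leftmost match kind; through the public API that corresponds to roughly
/// the following sketch:
///
/// ```ignore
/// let ac = AhoCorasickBuilder::new()
///     .match_kind(MatchKind::LeftmostFirst) // or MatchKind::LeftmostLongest
///     .build(&["abcd", "bce", "b"]);
/// let m = ac.find("abce").unwrap();
/// assert_eq!((m.pattern(), m.start(), m.end()), (1, 1, 4));
/// ```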
const LEFTMOST: &'static [SearchTest] = &[ t!(leftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(leftmost010, &["a", ""], "a", &[(0, 0, 1), (1, 1, 1)]), t!(leftmost020, &["", ""], "a", &[(0, 0, 0), (0, 1, 1)]), t!(leftmost030, &["a", "ab"], "aa", &[(0, 0, 1), (0, 1, 2)]), t!(leftmost031, &["ab", "a"], "aa", &[(1, 0, 1), (1, 1, 2)]), t!(leftmost032, &["ab", "a"], "xayabbbz", &[(1, 1, 2), (0, 3, 5)]), t!(leftmost300, &["abcd", "bce", "b"], "abce", &[(1, 1, 4)]), t!(leftmost310, &["abcd", "ce", "bc"], "abce", &[(2, 1, 3)]), t!(leftmost320, &["abcd", "bce", "ce", "b"], "abce", &[(1, 1, 4)]), t!(leftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[(3, 1, 3)]), t!(leftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), t!(leftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), t!( leftmost360, &["abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(2, 0, 8),] ), t!( leftmost370, &["abcdefghi", "cde", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost380, &["abcdefghi", "hz", "abcdefgh", "a"], "abcdefghz", &[(2, 0, 8),] ), t!( leftmost390, &["b", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost400, &["h", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( leftmost410, &["z", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8), (0, 8, 9),] ), ]; /// Like LEFTMOST, but for anchored searches. const ANCHORED_LEFTMOST: &'static [SearchTest] = &[ t!(aleftmost000, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(aleftmost010, &["a", ""], "a", &[(0, 0, 1)]), t!(aleftmost020, &["", ""], "a", &[(0, 0, 0)]), t!(aleftmost030, &["a", "ab"], "aa", &[(0, 0, 1)]), t!(aleftmost031, &["ab", "a"], "aa", &[(1, 0, 1)]), t!(aleftmost032, &["ab", "a"], "xayabbbz", &[]), t!(aleftmost300, &["abcd", "bce", "b"], "abce", &[]), t!(aleftmost310, &["abcd", "ce", "bc"], "abce", &[]), t!(aleftmost320, &["abcd", "bce", "ce", "b"], "abce", &[]), t!(aleftmost330, &["abcd", "bce", "cz", "bc"], "abcz", &[]), t!(aleftmost340, &["bce", "cz", "bc"], "bcz", &[(2, 0, 2)]), t!(aleftmost350, &["abc", "bd", "ab"], "abd", &[(2, 0, 2)]), t!( aleftmost360, &["abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(2, 0, 8),] ), t!( aleftmost370, &["abcdefghi", "cde", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( aleftmost380, &["abcdefghi", "hz", "abcdefgh", "a"], "abcdefghz", &[(2, 0, 8),] ), t!( aleftmost390, &["b", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( aleftmost400, &["h", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!( aleftmost410, &["z", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8)] ), ]; /// Tests for non-overlapping leftmost-first match semantics. These tests /// should generally be specific to leftmost-first, which means they should /// generally fail under leftmost-longest semantics. 
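///
/// As a concrete sketch of the difference (mirroring `leftfirst000` here and
/// `leftlong000` further down), with patterns `["ab", "abcd"]` and haystack
/// `"abcd"`:
///
/// ```ignore
/// let triple = |m: Match| (m.pattern(), m.start(), m.end());
///
/// let first = AhoCorasickBuilder::new()
///     .match_kind(MatchKind::LeftmostFirst)
///     .build(&["ab", "abcd"]);
/// assert_eq!(first.find("abcd").map(triple), Some((0, 0, 2)));
///
/// let longest = AhoCorasickBuilder::new()
///     .match_kind(MatchKind::LeftmostLongest)
///     .build(&["ab", "abcd"]);
/// assert_eq!(longest.find("abcd").map(triple), Some((1, 0, 4)));
/// ```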
const LEFTMOST_FIRST: &'static [SearchTest] = &[ t!(leftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(leftfirst010, &["", "a"], "a", &[(0, 0, 0), (0, 1, 1)]), t!(leftfirst011, &["", "a", ""], "a", &[(0, 0, 0), (0, 1, 1),]), t!(leftfirst012, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]), t!(leftfirst013, &["", "", "a"], "a", &[(0, 0, 0), (0, 1, 1),]), t!(leftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), t!(leftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(leftfirst040, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (0, 3, 4)]), t!(leftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(1, 1, 5)]), t!(leftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), t!(leftfirst300, &["abcd", "b", "bce"], "abce", &[(1, 1, 2)]), t!( leftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[(1, 1, 2), (3, 2, 4),] ), t!( leftfirst320, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(0, 0, 1), (2, 7, 9),] ), t!(leftfirst330, &["a", "abab"], "abab", &[(0, 0, 1), (0, 2, 3)]), ]; /// Like LEFTMOST_FIRST, but for anchored searches. const ANCHORED_LEFTMOST_FIRST: &'static [SearchTest] = &[ t!(aleftfirst000, &["ab", "abcd"], "abcd", &[(0, 0, 2)]), t!(aleftfirst010, &["", "a"], "a", &[(0, 0, 0)]), t!(aleftfirst011, &["", "a", ""], "a", &[(0, 0, 0)]), t!(aleftfirst012, &["a", "", ""], "a", &[(0, 0, 1)]), t!(aleftfirst013, &["", "", "a"], "a", &[(0, 0, 0)]), t!(aleftfirst020, &["abcd", "ab"], "abcd", &[(0, 0, 4)]), t!(aleftfirst030, &["ab", "ab"], "abcd", &[(0, 0, 2)]), t!(aleftfirst040, &["a", "ab"], "xayabbbz", &[]), t!(aleftfirst100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]), t!(aleftfirst110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]), t!(aleftfirst300, &["abcd", "b", "bce"], "abce", &[]), t!(aleftfirst310, &["abcd", "b", "bce", "ce"], "abce", &[]), t!( aleftfirst320, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(0, 0, 1)] ), t!(aleftfirst330, &["a", "abab"], "abab", &[(0, 0, 1)]), ]; /// Tests for non-overlapping leftmost-longest match semantics. These tests /// should generally be specific to leftmost-longest, which means they should /// generally fail under leftmost-first semantics. const LEFTMOST_LONGEST: &'static [SearchTest] = &[ t!(leftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), t!(leftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), t!(leftlong020, &["", "a"], "a", &[(1, 0, 1), (0, 1, 1),]), t!(leftlong021, &["", "a", ""], "a", &[(1, 0, 1), (0, 1, 1),]), t!(leftlong022, &["a", "", ""], "a", &[(0, 0, 1), (1, 1, 1),]), t!(leftlong023, &["", "", "a"], "a", &[(2, 0, 1), (0, 1, 1),]), t!(leftlong030, &["", "a"], "aa", &[(1, 0, 1), (1, 1, 2), (0, 2, 2),]), t!(leftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), t!(leftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), t!(leftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), t!(leftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), t!(leftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[(2, 1, 6)]), t!(leftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[(1, 1, 6)]), t!(leftlong300, &["abcd", "b", "bce"], "abce", &[(2, 1, 4)]), t!( leftlong310, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!(leftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), t!(leftlong330, &["abcd", "b", "ce"], "abce", &[(1, 1, 2), (2, 2, 4),]), t!(leftlong340, &["a", "ab"], "xayabbbz", &[(0, 1, 2), (1, 3, 5)]), ]; /// Like LEFTMOST_LONGEST, but for anchored searches. 
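///
/// Anchored searches only report matches that begin at the starting position
/// of the search. For instance, `aleftlong340` below expects no match at all,
/// even though its unanchored twin (`leftlong340` above) finds two; roughly:
///
/// ```ignore
/// let ac = AhoCorasickBuilder::new()
///     .match_kind(MatchKind::LeftmostLongest)
///     .anchored(true)
///     .build(&["a", "ab"]);
/// assert!(ac.find("xayabbbz").is_none());
/// ```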
const ANCHORED_LEFTMOST_LONGEST: &'static [SearchTest] = &[ t!(aleftlong000, &["ab", "abcd"], "abcd", &[(1, 0, 4)]), t!(aleftlong010, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4),]), t!(aleftlong020, &["", "a"], "a", &[(1, 0, 1)]), t!(aleftlong021, &["", "a", ""], "a", &[(1, 0, 1)]), t!(aleftlong022, &["a", "", ""], "a", &[(0, 0, 1)]), t!(aleftlong023, &["", "", "a"], "a", &[(2, 0, 1)]), t!(aleftlong030, &["", "a"], "aa", &[(1, 0, 1)]), t!(aleftlong040, &["a", "ab"], "a", &[(0, 0, 1)]), t!(aleftlong050, &["a", "ab"], "ab", &[(1, 0, 2)]), t!(aleftlong060, &["ab", "a"], "a", &[(1, 0, 1)]), t!(aleftlong070, &["ab", "a"], "ab", &[(0, 0, 2)]), t!(aleftlong100, &["abcdefg", "bcde", "bcdef"], "abcdef", &[]), t!(aleftlong110, &["abcdefg", "bcdef", "bcde"], "abcdef", &[]), t!(aleftlong300, &["abcd", "b", "bce"], "abce", &[]), t!( aleftlong310, &["a", "abcdefghi", "hz", "abcdefgh"], "abcdefghz", &[(3, 0, 8),] ), t!(aleftlong320, &["a", "abab"], "abab", &[(1, 0, 4)]), t!(aleftlong330, &["abcd", "b", "ce"], "abce", &[]), t!(aleftlong340, &["a", "ab"], "xayabbbz", &[]), ]; /// Tests for non-overlapping match semantics. /// /// Generally these tests shouldn't pass when using overlapping semantics. /// These should pass for both standard and leftmost match semantics. const NON_OVERLAPPING: &'static [SearchTest] = &[ t!(nover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), t!(nover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), t!(nover030, &["abc", "bc"], "zazabcz", &[(0, 3, 6),]), t!( nover100, &["ab", "ba"], "abababa", &[(0, 0, 2), (0, 2, 4), (0, 4, 6),] ), t!(nover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (0, 6, 9),]), t!(nover300, &["", ""], "", &[(0, 0, 0),]), t!(nover310, &["", ""], "a", &[(0, 0, 0), (0, 1, 1),]), ]; /// Like NON_OVERLAPPING, but for anchored searches. const ANCHORED_NON_OVERLAPPING: &'static [SearchTest] = &[ t!(anover010, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4),]), t!(anover020, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4),]), t!(anover030, &["abc", "bc"], "zazabcz", &[]), t!(anover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]), t!(anover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3)]), t!(anover300, &["", ""], "", &[(0, 0, 0),]), t!(anover310, &["", ""], "a", &[(0, 0, 0)]), ]; /// Tests for overlapping match semantics. /// /// This only supports standard match semantics, since leftmost-{first,longest} /// do not support overlapping matches. 
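///
/// Overlapping searches go through `find_overlapping_iter` and report every
/// match. Mirroring `over000` below, as a rough sketch:
///
/// ```ignore
/// let ac = AhoCorasickBuilder::new().build(&["abcd", "bcd", "cd", "b"]);
/// let matches: Vec<_> = ac
///     .find_overlapping_iter("abcd")
///     .map(|m| (m.pattern(), m.start(), m.end()))
///     .collect();
/// assert_eq!(matches, vec![(3, 1, 2), (0, 0, 4), (1, 1, 4), (2, 2, 4)]);
/// ```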
const OVERLAPPING: &'static [SearchTest] = &[ t!( over000, &["abcd", "bcd", "cd", "b"], "abcd", &[(3, 1, 2), (0, 0, 4), (1, 1, 4), (2, 2, 4),] ), t!( over010, &["bcd", "cd", "b", "abcd"], "abcd", &[(2, 1, 2), (3, 0, 4), (0, 1, 4), (1, 2, 4),] ), t!( over020, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4), (1, 1, 4), (2, 2, 4),] ), t!( over030, &["bcd", "abcd", "cd"], "abcd", &[(1, 0, 4), (0, 1, 4), (2, 2, 4),] ), t!( over040, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4), (0, 1, 4), (1, 2, 4),] ), t!(over050, &["abc", "bc"], "zazabcz", &[(0, 3, 6), (1, 4, 6),]), t!( over100, &["ab", "ba"], "abababa", &[(0, 0, 2), (1, 1, 3), (0, 2, 4), (1, 3, 5), (0, 4, 6), (1, 5, 7),] ), t!( over200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (1, 0, 3), (0, 6, 9), (1, 6, 9),] ), t!(over300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]), t!( over310, &["", ""], "a", &[(0, 0, 0), (1, 0, 0), (0, 1, 1), (1, 1, 1),] ), t!(over320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1), (0, 1, 1),]), t!( over330, &["", "a", ""], "a", &[(0, 0, 0), (2, 0, 0), (1, 0, 1), (0, 1, 1), (2, 1, 1),] ), t!( over340, &["a", "", ""], "a", &[(1, 0, 0), (2, 0, 0), (0, 0, 1), (1, 1, 1), (2, 1, 1),] ), t!( over350, &["", "", "a"], "a", &[(0, 0, 0), (1, 0, 0), (2, 0, 1), (0, 1, 1), (1, 1, 1),] ), t!( over360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6), (0, 3, 6)] ), ]; /// Like OVERLAPPING, but for anchored searches. const ANCHORED_OVERLAPPING: &'static [SearchTest] = &[ t!(aover000, &["abcd", "bcd", "cd", "b"], "abcd", &[(0, 0, 4)]), t!(aover010, &["bcd", "cd", "b", "abcd"], "abcd", &[(3, 0, 4)]), t!(aover020, &["abcd", "bcd", "cd"], "abcd", &[(0, 0, 4)]), t!(aover030, &["bcd", "abcd", "cd"], "abcd", &[(1, 0, 4)]), t!(aover040, &["bcd", "cd", "abcd"], "abcd", &[(2, 0, 4)]), t!(aover050, &["abc", "bc"], "zazabcz", &[]), t!(aover100, &["ab", "ba"], "abababa", &[(0, 0, 2)]), t!(aover200, &["foo", "foo"], "foobarfoo", &[(0, 0, 3), (1, 0, 3)]), t!(aover300, &["", ""], "", &[(0, 0, 0), (1, 0, 0),]), t!(aover310, &["", ""], "a", &[(0, 0, 0), (1, 0, 0)]), t!(aover320, &["", "a"], "a", &[(0, 0, 0), (1, 0, 1)]), t!(aover330, &["", "a", ""], "a", &[(0, 0, 0), (2, 0, 0), (1, 0, 1)]), t!(aover340, &["a", "", ""], "a", &[(1, 0, 0), (2, 0, 0), (0, 0, 1)]), t!(aover350, &["", "", "a"], "a", &[(0, 0, 0), (1, 0, 0), (2, 0, 1)]), t!(aover360, &["foo", "foofoo"], "foofoo", &[(0, 0, 3), (1, 0, 6)]), ]; /// Tests for ASCII case insensitivity. /// /// These tests should all have the same behavior regardless of match semantics /// or whether the search is overlapping. const ASCII_CASE_INSENSITIVE: &'static [SearchTest] = &[ t!(acasei000, &["a"], "A", &[(0, 0, 1)]), t!(acasei010, &["Samwise"], "SAMWISE", &[(0, 0, 7)]), t!(acasei011, &["Samwise"], "SAMWISE.abcd", &[(0, 0, 7)]), t!(acasei020, &["fOoBaR"], "quux foobar baz", &[(0, 5, 11)]), ]; /// Like ASCII_CASE_INSENSITIVE, but specifically for non-overlapping tests. const ASCII_CASE_INSENSITIVE_NON_OVERLAPPING: &'static [SearchTest] = &[ t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3)]), t!(acasei000, &["FOO", "foo"], "fOo", &[(0, 0, 3)]), ]; /// Like ASCII_CASE_INSENSITIVE, but specifically for overlapping tests. const ASCII_CASE_INSENSITIVE_OVERLAPPING: &'static [SearchTest] = &[ t!(acasei000, &["foo", "FOO"], "fOo", &[(0, 0, 3), (1, 0, 3)]), t!(acasei001, &["FOO", "foo"], "fOo", &[(0, 0, 3), (1, 0, 3)]), ]; /// Regression tests that are applied to all Aho-Corasick combinations. /// /// If regression tests are needed for specific match semantics, then add them /// to the appropriate group above. 
const REGRESSION: &'static [SearchTest] = &[ t!(regression010, &["inf", "ind"], "infind", &[(0, 0, 3), (1, 3, 6),]), t!(regression020, &["ind", "inf"], "infind", &[(1, 0, 3), (0, 3, 6),]), t!( regression030, &["libcore/", "libstd/"], "libcore/char/methods.rs", &[(0, 0, 8),] ), t!( regression040, &["libstd/", "libcore/"], "libcore/char/methods.rs", &[(1, 0, 8),] ), t!( regression050, &["\x00\x00\x01", "\x00\x00\x00"], "\x00\x00\x00", &[(1, 0, 3),] ), t!( regression060, &["\x00\x00\x00", "\x00\x00\x01"], "\x00\x00\x00", &[(0, 0, 3),] ), ]; // Now define a test for each combination of things above that we want to run. // Since there are a few different combinations for each collection of tests, // we define a couple of macros to avoid repetition drudgery. The testconfig // macro constructs the automaton from a given match kind, and runs the search // tests one-by-one over the given collection. The `with` parameter allows one // to configure the builder with additional parameters. The testcombo macro // invokes testconfig in precisely this way: it sets up several tests where // each one turns a different knob on AhoCorasickBuilder. macro_rules! testconfig { (overlapping, $name:ident, $collection:expr, $kind:ident, $with:expr) => { #[test] fn $name() { run_search_tests($collection, |test| { let mut builder = AhoCorasickBuilder::new(); $with(&mut builder); builder .match_kind(MatchKind::$kind) .build(test.patterns) .find_overlapping_iter(test.haystack) .collect() }); } }; (stream, $name:ident, $collection:expr, $kind:ident, $with:expr) => { #[test] fn $name() { run_search_tests($collection, |test| { let buf = io::BufReader::with_capacity(1, test.haystack.as_bytes()); let mut builder = AhoCorasickBuilder::new(); $with(&mut builder); builder .match_kind(MatchKind::$kind) .build(test.patterns) .stream_find_iter(buf) .map(|result| result.unwrap()) .collect() }); } }; ($name:ident, $collection:expr, $kind:ident, $with:expr) => { #[test] fn $name() { run_search_tests($collection, |test| { let mut builder = AhoCorasickBuilder::new(); $with(&mut builder); builder .match_kind(MatchKind::$kind) .build(test.patterns) .find_iter(test.haystack) .collect() }); } }; } macro_rules! testcombo { ($name:ident, $collection:expr, $kind:ident) => { mod $name { use super::*; testconfig!(nfa_default, $collection, $kind, |_| ()); testconfig!( nfa_no_prefilter, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.prefilter(false); } ); testconfig!( nfa_all_sparse, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.dense_depth(0); } ); testconfig!( nfa_all_dense, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.dense_depth(usize::MAX); } ); testconfig!( dfa_default, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.dfa(true); } ); testconfig!( dfa_no_prefilter, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.dfa(true).prefilter(false); } ); testconfig!( dfa_all_sparse, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.dfa(true).dense_depth(0); } ); testconfig!( dfa_all_dense, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.dfa(true).dense_depth(usize::MAX); } ); testconfig!( dfa_no_byte_class, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.dfa(true).byte_classes(false); } ); testconfig!( dfa_no_premultiply, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.dfa(true).premultiply(false); } ); testconfig!( dfa_no_byte_class_no_premultiply, $collection, $kind, |b: &mut AhoCorasickBuilder| { b.dfa(true).byte_classes(false).premultiply(false); } ); } }; } // Write out the combinations. 
testcombo!(search_leftmost_longest, AC_LEFTMOST_LONGEST, LeftmostLongest); testcombo!(search_leftmost_first, AC_LEFTMOST_FIRST, LeftmostFirst); testcombo!( search_standard_nonoverlapping, AC_STANDARD_NON_OVERLAPPING, Standard ); // Write out the overlapping combo by hand since there is only one of them. testconfig!( overlapping, search_standard_overlapping_nfa_default, AC_STANDARD_OVERLAPPING, Standard, |_| () ); testconfig!( overlapping, search_standard_overlapping_nfa_all_sparse, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.dense_depth(0); } ); testconfig!( overlapping, search_standard_overlapping_nfa_all_dense, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.dense_depth(usize::MAX); } ); testconfig!( overlapping, search_standard_overlapping_dfa_default, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.dfa(true); } ); testconfig!( overlapping, search_standard_overlapping_dfa_all_sparse, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.dfa(true).dense_depth(0); } ); testconfig!( overlapping, search_standard_overlapping_dfa_all_dense, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.dfa(true).dense_depth(usize::MAX); } ); testconfig!( overlapping, search_standard_overlapping_dfa_no_byte_class, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.dfa(true).byte_classes(false); } ); testconfig!( overlapping, search_standard_overlapping_dfa_no_premultiply, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.dfa(true).premultiply(false); } ); testconfig!( overlapping, search_standard_overlapping_dfa_no_byte_class_no_premultiply, AC_STANDARD_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.dfa(true).byte_classes(false).premultiply(false); } ); // Also write out tests manually for streams, since we only test the standard // match semantics. We also don't bother testing different automaton // configurations, since those are well covered by tests above. testconfig!( stream, search_standard_stream_nfa_default, AC_STANDARD_NON_OVERLAPPING, Standard, |_| () ); testconfig!( stream, search_standard_stream_dfa_default, AC_STANDARD_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.dfa(true); } ); // Same thing for anchored searches. Write them out manually. 
testconfig!( search_standard_anchored_nfa_default, AC_STANDARD_ANCHORED_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.anchored(true); } ); testconfig!( search_standard_anchored_dfa_default, AC_STANDARD_ANCHORED_NON_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.anchored(true).dfa(true); } ); testconfig!( overlapping, search_standard_anchored_overlapping_nfa_default, AC_STANDARD_ANCHORED_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.anchored(true); } ); testconfig!( overlapping, search_standard_anchored_overlapping_dfa_default, AC_STANDARD_ANCHORED_OVERLAPPING, Standard, |b: &mut AhoCorasickBuilder| { b.anchored(true).dfa(true); } ); testconfig!( search_leftmost_first_anchored_nfa_default, AC_LEFTMOST_FIRST_ANCHORED, LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.anchored(true); } ); testconfig!( search_leftmost_first_anchored_dfa_default, AC_LEFTMOST_FIRST_ANCHORED, LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.anchored(true).dfa(true); } ); testconfig!( search_leftmost_longest_anchored_nfa_default, AC_LEFTMOST_LONGEST_ANCHORED, LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.anchored(true); } ); testconfig!( search_leftmost_longest_anchored_dfa_default, AC_LEFTMOST_LONGEST_ANCHORED, LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.anchored(true).dfa(true); } ); // And also write out the test combinations for ASCII case insensitivity. testconfig!( acasei_standard_nfa_default, &[ASCII_CASE_INSENSITIVE], Standard, |b: &mut AhoCorasickBuilder| { b.prefilter(false).ascii_case_insensitive(true); } ); testconfig!( acasei_standard_dfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], Standard, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true).dfa(true); } ); testconfig!( overlapping, acasei_standard_overlapping_nfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], Standard, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true); } ); testconfig!( overlapping, acasei_standard_overlapping_dfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_OVERLAPPING], Standard, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true).dfa(true); } ); testconfig!( acasei_leftmost_first_nfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true); } ); testconfig!( acasei_leftmost_first_dfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostFirst, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true).dfa(true); } ); testconfig!( acasei_leftmost_longest_nfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true); } ); testconfig!( acasei_leftmost_longest_dfa_default, &[ASCII_CASE_INSENSITIVE, ASCII_CASE_INSENSITIVE_NON_OVERLAPPING], LeftmostLongest, |b: &mut AhoCorasickBuilder| { b.ascii_case_insensitive(true).dfa(true); } ); #[test] fn search_tests_have_unique_names() { let assert = |constname, tests: &[SearchTest]| { let mut seen = HashMap::new(); // map from test name to position for (i, test) in tests.iter().enumerate() { if !seen.contains_key(test.name) { seen.insert(test.name, i); } else { let last = seen[test.name]; panic!( "{} tests have duplicate names at positions {} and {}", constname, last, i ); } } }; assert("BASICS", BASICS); assert("STANDARD", STANDARD); assert("LEFTMOST", LEFTMOST); assert("LEFTMOST_FIRST", LEFTMOST_FIRST); assert("LEFTMOST_LONGEST", 
LEFTMOST_LONGEST); assert("NON_OVERLAPPING", NON_OVERLAPPING); assert("OVERLAPPING", OVERLAPPING); assert("REGRESSION", REGRESSION); } #[test] #[should_panic] fn stream_not_allowed_leftmost_first() { let fsm = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostFirst) .build(None::); assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0); } #[test] #[should_panic] fn stream_not_allowed_leftmost_longest() { let fsm = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostLongest) .build(None::); assert_eq!(fsm.stream_find_iter(&b""[..]).count(), 0); } #[test] #[should_panic] fn overlapping_not_allowed_leftmost_first() { let fsm = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostFirst) .build(None::); assert_eq!(fsm.find_overlapping_iter("").count(), 0); } #[test] #[should_panic] fn overlapping_not_allowed_leftmost_longest() { let fsm = AhoCorasickBuilder::new() .match_kind(MatchKind::LeftmostLongest) .build(None::); assert_eq!(fsm.find_overlapping_iter("").count(), 0); } #[test] fn state_id_too_small() { let mut patterns = vec![]; for c1 in (b'a'..b'z').map(|b| b as char) { for c2 in (b'a'..b'z').map(|b| b as char) { for c3 in (b'a'..b'z').map(|b| b as char) { patterns.push(format!("{}{}{}", c1, c2, c3)); } } } let result = AhoCorasickBuilder::new().build_with_size::(&patterns); assert!(result.is_err()); } // See: https://github.com/BurntSushi/aho-corasick/issues/44 // // In short, this test ensures that enabling ASCII case insensitivity does not // visit an exponential number of states when filling in failure transitions. #[test] fn regression_ascii_case_insensitive_no_exponential() { let ac = AhoCorasickBuilder::new() .ascii_case_insensitive(true) .build(&["Tsubaki House-Triple Shot Vol01校花三姐妹"]); assert!(ac.find("").is_none()); } fn run_search_tests Vec>( which: TestCollection, mut f: F, ) { let get_match_triples = |matches: Vec| -> Vec<(usize, usize, usize)> { matches .into_iter() .map(|m| (m.pattern(), m.start(), m.end())) .collect() }; for &tests in which { for test in tests { assert_eq!( test.matches, get_match_triples(f(&test)).as_slice(), "test: {}, patterns: {:?}, haystack: {:?}", test.name, test.patterns, test.haystack ); } } } aho-corasick-0.7.8/.cargo_vcs_info.json0000644000000001121361627455300135170ustar00{ "git": { "sha1": "192ab497c72fcbb885d8c553f2a74547b593b6f0" } }