grep-regex-0.1.12/.cargo_vcs_info.json0000644000000001520000000000100131600ustar { "git": { "sha1": "fee7ac79f11e05640c5609a825ed6d1359cae472" }, "path_in_vcs": "crates/regex" }grep-regex-0.1.12/Cargo.toml0000644000000022630000000000100111630ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" name = "grep-regex" version = "0.1.12" authors = ["Andrew Gallant "] description = """ Use Rust's regex library with the 'grep' crate. """ homepage = "https://github.com/BurntSushi/ripgrep/tree/master/crates/regex" documentation = "https://docs.rs/grep-regex" readme = "README.md" keywords = [ "regex", "grep", "search", "pattern", "line", ] license = "Unlicense OR MIT" repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/regex" [dependencies.bstr] version = "1.6.2" [dependencies.grep-matcher] version = "0.1.7" [dependencies.log] version = "0.4.20" [dependencies.regex-automata] version = "0.4.0" [dependencies.regex-syntax] version = "0.8.0" grep-regex-0.1.12/Cargo.toml.orig000064400000000000000000000012351046102023000146420ustar 00000000000000[package] name = "grep-regex" version = "0.1.12" #:version authors = ["Andrew Gallant "] description = """ Use Rust's regex library with the 'grep' crate. """ documentation = "https://docs.rs/grep-regex" homepage = "https://github.com/BurntSushi/ripgrep/tree/master/crates/regex" repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/regex" readme = "README.md" keywords = ["regex", "grep", "search", "pattern", "line"] license = "Unlicense OR MIT" edition = "2021" [dependencies] bstr = "1.6.2" grep-matcher = { version = "0.1.7", path = "../matcher" } log = "0.4.20" regex-automata = { version = "0.4.0" } regex-syntax = "0.8.0" grep-regex-0.1.12/LICENSE-MIT000064400000000000000000000020711046102023000134060ustar 00000000000000The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
grep-regex-0.1.12/README.md000064400000000000000000000015201046102023000132270ustar 00000000000000grep-regex ---------- The `grep-regex` crate provides an implementation of the `Matcher` trait from the `grep-matcher` crate. This implementation permits Rust's regex engine to be used in the `grep` crate for fast line oriented searching. [![Build status](https://github.com/BurntSushi/ripgrep/workflows/ci/badge.svg)](https://github.com/BurntSushi/ripgrep/actions) [![](https://img.shields.io/crates/v/grep-regex.svg)](https://crates.io/crates/grep-regex) Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). ### Documentation [https://docs.rs/grep-regex](https://docs.rs/grep-regex) **NOTE:** You probably don't want to use this crate directly. Instead, you should prefer the facade defined in the [`grep`](https://docs.rs/grep) crate. ### Usage Add this to your `Cargo.toml`: ```toml [dependencies] grep-regex = "0.1" ``` grep-regex-0.1.12/UNLICENSE000064400000000000000000000022731046102023000132260ustar 00000000000000This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For more information, please refer to grep-regex-0.1.12/src/ast.rs000064400000000000000000000142411046102023000137000ustar 00000000000000use regex_syntax::ast::{self, Ast}; /// The results of analyzing AST of a regular expression (e.g., for supporting /// smart case). #[derive(Clone, Debug)] pub(crate) struct AstAnalysis { /// True if and only if a literal uppercase character occurs in the regex. any_uppercase: bool, /// True if and only if the regex contains any literal at all. any_literal: bool, } impl AstAnalysis { /// Returns a `AstAnalysis` value by doing analysis on the AST of `pattern`. /// /// If `pattern` is not a valid regular expression, then `None` is /// returned. #[cfg(test)] pub(crate) fn from_pattern(pattern: &str) -> Option { regex_syntax::ast::parse::Parser::new() .parse(pattern) .map(|ast| AstAnalysis::from_ast(&ast)) .ok() } /// Perform an AST analysis given the AST. pub(crate) fn from_ast(ast: &Ast) -> AstAnalysis { let mut analysis = AstAnalysis::new(); analysis.from_ast_impl(ast); analysis } /// Returns true if and only if a literal uppercase character occurs in /// the pattern. /// /// For example, a pattern like `\pL` contains no uppercase literals, /// even though `L` is uppercase and the `\pL` class contains uppercase /// characters. 
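    ///
    /// A hypothetical illustration (it uses the test-only `from_pattern`
    /// constructor, so it is shown as text rather than a compiled doctest):
    ///
    /// ```text
    /// AstAnalysis::from_pattern(r"foo\pL").unwrap().any_uppercase() == false
    /// AstAnalysis::from_pattern(r"Foo\pL").unwrap().any_uppercase() == true
    /// ```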
pub(crate) fn any_uppercase(&self) -> bool { self.any_uppercase } /// Returns true if and only if the regex contains any literal at all. /// /// For example, a pattern like `\pL` reports `false`, but a pattern like /// `\pLfoo` reports `true`. pub(crate) fn any_literal(&self) -> bool { self.any_literal } /// Creates a new `AstAnalysis` value with an initial configuration. fn new() -> AstAnalysis { AstAnalysis { any_uppercase: false, any_literal: false } } fn from_ast_impl(&mut self, ast: &Ast) { if self.done() { return; } match *ast { Ast::Empty(_) => {} Ast::Flags(_) | Ast::Dot(_) | Ast::Assertion(_) | Ast::ClassUnicode(_) | Ast::ClassPerl(_) => {} Ast::Literal(ref x) => { self.from_ast_literal(x); } Ast::ClassBracketed(ref x) => { self.from_ast_class_set(&x.kind); } Ast::Repetition(ref x) => { self.from_ast_impl(&x.ast); } Ast::Group(ref x) => { self.from_ast_impl(&x.ast); } Ast::Alternation(ref alt) => { for x in &alt.asts { self.from_ast_impl(x); } } Ast::Concat(ref alt) => { for x in &alt.asts { self.from_ast_impl(x); } } } } fn from_ast_class_set(&mut self, ast: &ast::ClassSet) { if self.done() { return; } match *ast { ast::ClassSet::Item(ref item) => { self.from_ast_class_set_item(item); } ast::ClassSet::BinaryOp(ref x) => { self.from_ast_class_set(&x.lhs); self.from_ast_class_set(&x.rhs); } } } fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) { if self.done() { return; } match *ast { ast::ClassSetItem::Empty(_) | ast::ClassSetItem::Ascii(_) | ast::ClassSetItem::Unicode(_) | ast::ClassSetItem::Perl(_) => {} ast::ClassSetItem::Literal(ref x) => { self.from_ast_literal(x); } ast::ClassSetItem::Range(ref x) => { self.from_ast_literal(&x.start); self.from_ast_literal(&x.end); } ast::ClassSetItem::Bracketed(ref x) => { self.from_ast_class_set(&x.kind); } ast::ClassSetItem::Union(ref union) => { for x in &union.items { self.from_ast_class_set_item(x); } } } } fn from_ast_literal(&mut self, ast: &ast::Literal) { self.any_literal = true; self.any_uppercase = self.any_uppercase || ast.c.is_uppercase(); } /// Returns true if and only if the attributes can never change no matter /// what other AST it might see. 
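    ///
    /// In other words, once both `any_uppercase` and `any_literal` are true,
    /// no further AST nodes can change the analysis, so traversal can stop
    /// early.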
fn done(&self) -> bool { self.any_uppercase && self.any_literal } } #[cfg(test)] mod tests { use super::*; fn analysis(pattern: &str) -> AstAnalysis { AstAnalysis::from_pattern(pattern).unwrap() } #[test] fn various() { let x = analysis(""); assert!(!x.any_uppercase); assert!(!x.any_literal); let x = analysis("foo"); assert!(!x.any_uppercase); assert!(x.any_literal); let x = analysis("Foo"); assert!(x.any_uppercase); assert!(x.any_literal); let x = analysis("foO"); assert!(x.any_uppercase); assert!(x.any_literal); let x = analysis(r"foo\\"); assert!(!x.any_uppercase); assert!(x.any_literal); let x = analysis(r"foo\w"); assert!(!x.any_uppercase); assert!(x.any_literal); let x = analysis(r"foo\S"); assert!(!x.any_uppercase); assert!(x.any_literal); let x = analysis(r"foo\p{Ll}"); assert!(!x.any_uppercase); assert!(x.any_literal); let x = analysis(r"foo[a-z]"); assert!(!x.any_uppercase); assert!(x.any_literal); let x = analysis(r"foo[A-Z]"); assert!(x.any_uppercase); assert!(x.any_literal); let x = analysis(r"foo[\S\t]"); assert!(!x.any_uppercase); assert!(x.any_literal); let x = analysis(r"foo\\S"); assert!(x.any_uppercase); assert!(x.any_literal); let x = analysis(r"\p{Ll}"); assert!(!x.any_uppercase); assert!(!x.any_literal); let x = analysis(r"aBc\w"); assert!(x.any_uppercase); assert!(x.any_literal); let x = analysis(r"a\u0061"); assert!(!x.any_uppercase); assert!(x.any_literal); } } grep-regex-0.1.12/src/ban.rs000064400000000000000000000051611046102023000136520ustar 00000000000000use regex_syntax::hir::{ self, ClassBytesRange, ClassUnicodeRange, Hir, HirKind, }; use crate::error::{Error, ErrorKind}; /// Returns an error when a sub-expression in `expr` must match `byte`. pub(crate) fn check(expr: &Hir, byte: u8) -> Result<(), Error> { assert!(byte.is_ascii(), "ban byte must be ASCII"); let ch = char::from(byte); let invalid = || Err(Error::new(ErrorKind::Banned(byte))); match expr.kind() { HirKind::Empty => {} HirKind::Literal(hir::Literal(ref lit)) => { if lit.iter().find(|&&b| b == byte).is_some() { return invalid(); } } HirKind::Class(hir::Class::Unicode(ref cls)) => { if cls.ranges().iter().map(|r| r.len()).sum::() == 1 { let contains = |r: &&ClassUnicodeRange| r.start() <= ch && ch <= r.end(); if cls.ranges().iter().find(contains).is_some() { return invalid(); } } } HirKind::Class(hir::Class::Bytes(ref cls)) => { if cls.ranges().iter().map(|r| r.len()).sum::() == 1 { let contains = |r: &&ClassBytesRange| { r.start() <= byte && byte <= r.end() }; if cls.ranges().iter().find(contains).is_some() { return invalid(); } } } HirKind::Look(_) => {} HirKind::Repetition(ref x) => check(&x.sub, byte)?, HirKind::Capture(ref x) => check(&x.sub, byte)?, HirKind::Concat(ref xs) => { for x in xs.iter() { check(x, byte)?; } } HirKind::Alternation(ref xs) => { for x in xs.iter() { check(x, byte)?; } } }; Ok(()) } #[cfg(test)] mod tests { use regex_syntax::Parser; /// Returns true when the given pattern is detected to contain the given /// banned byte. 
fn check(pattern: &str, byte: u8) -> bool { let hir = Parser::new().parse(pattern).unwrap(); super::check(&hir, byte).is_err() } #[test] fn various() { assert!(check(r"\x00", 0)); assert!(check(r"a\x00", 0)); assert!(check(r"\x00b", 0)); assert!(check(r"a\x00b", 0)); assert!(check(r"\x00|ab", 0)); assert!(check(r"ab|\x00", 0)); assert!(check(r"\x00?", 0)); assert!(check(r"(\x00)", 0)); assert!(check(r"[\x00]", 0)); assert!(check(r"[^[^\x00]]", 0)); assert!(!check(r"[^\x00]", 0)); assert!(!check(r"[\x00a]", 0)); } } grep-regex-0.1.12/src/config.rs000064400000000000000000000342601046102023000143610ustar 00000000000000use { grep_matcher::{ByteSet, LineTerminator}, regex_automata::meta::Regex, regex_syntax::{ ast, hir::{self, Hir}, }, }; use crate::{ ast::AstAnalysis, ban, error::Error, non_matching::non_matching_bytes, strip::strip_from_match, }; /// Config represents the configuration of a regex matcher in this crate. /// The configuration is itself a rough combination of the knobs found in /// the `regex` crate itself, along with additional `grep-matcher` specific /// options. /// /// The configuration can be used to build a "configured" HIR expression. A /// configured HIR expression is an HIR expression that is aware of the /// configuration which generated it, and provides transformation on that HIR /// such that the configuration is preserved. #[derive(Clone, Debug)] pub(crate) struct Config { pub(crate) case_insensitive: bool, pub(crate) case_smart: bool, pub(crate) multi_line: bool, pub(crate) dot_matches_new_line: bool, pub(crate) swap_greed: bool, pub(crate) ignore_whitespace: bool, pub(crate) unicode: bool, pub(crate) octal: bool, pub(crate) size_limit: usize, pub(crate) dfa_size_limit: usize, pub(crate) nest_limit: u32, pub(crate) line_terminator: Option, pub(crate) ban: Option, pub(crate) crlf: bool, pub(crate) word: bool, pub(crate) fixed_strings: bool, pub(crate) whole_line: bool, } impl Default for Config { fn default() -> Config { Config { case_insensitive: false, case_smart: false, multi_line: false, dot_matches_new_line: false, swap_greed: false, ignore_whitespace: false, unicode: true, octal: false, // These size limits are much bigger than what's in the regex // crate by default. size_limit: 100 * (1 << 20), dfa_size_limit: 1000 * (1 << 20), nest_limit: 250, line_terminator: None, ban: None, crlf: false, word: false, fixed_strings: false, whole_line: false, } } } impl Config { /// Use this configuration to build an HIR from the given patterns. The HIR /// returned corresponds to a single regex that is an alternation of the /// patterns given. pub(crate) fn build_many>( &self, patterns: &[P], ) -> Result { ConfiguredHIR::new(self.clone(), patterns) } /// Accounting for the `smart_case` config knob, return true if and only if /// this pattern should be matched case insensitively. fn is_case_insensitive(&self, analysis: &AstAnalysis) -> bool { if self.case_insensitive { return true; } if !self.case_smart { return false; } analysis.any_literal() && !analysis.any_uppercase() } /// Returns whether the given patterns should be treated as "fixed strings" /// literals. This is different from just querying the `fixed_strings` knob /// in that if the knob is false, this will still return true in some cases /// if the patterns are themselves indistinguishable from literals. /// /// The main idea here is that if this returns true, then it is safe /// to build an `regex_syntax::hir::Hir` value directly from the given /// patterns as an alternation of `hir::Literal` values. 
    fn is_fixed_strings<P: AsRef<str>>(&self, patterns: &[P]) -> bool {
        // When these are enabled, we really need to parse the patterns and
        // let them go through the standard HIR translation process in order
        // for case folding transforms to be applied.
        if self.case_insensitive || self.case_smart {
            return false;
        }
        // Even if whole_line or word is enabled, both of those things can
        // be implemented by wrapping the Hir generated by an alternation of
        // fixed string literals. So for here at least, we don't care about
        // the word or whole_line settings.
        if self.fixed_strings {
            // ... but if any literal contains a line terminator, then we've
            // got to bail out because this will ultimately result in an
            // error.
            if let Some(lineterm) = self.line_terminator {
                for p in patterns.iter() {
                    if has_line_terminator(lineterm, p.as_ref()) {
                        return false;
                    }
                }
            }
            return true;
        }
        // In this case, the only way we can hand construct the Hir is if
        // none of the patterns contain meta characters. If they do, then we
        // need to send them through the standard parsing/translation
        // process.
        for p in patterns.iter() {
            let p = p.as_ref();
            if p.chars().any(regex_syntax::is_meta_character) {
                return false;
            }
            // Same deal as when fixed_strings is set above. If the pattern
            // has a line terminator anywhere, then we need to bail out and
            // let an error occur.
            if let Some(lineterm) = self.line_terminator {
                if has_line_terminator(lineterm, p) {
                    return false;
                }
            }
        }
        true
    }
}

/// A "configured" HIR expression, which is aware of the configuration which
/// produced this HIR.
///
/// Since the configuration is tracked, values with this type can be
/// transformed into other HIR expressions (or regular expressions) in a way
/// that preserves the configuration. For example, the `fast_line_regex`
/// method will apply literal extraction to the inner HIR and use that to
/// build a new regex that matches the extracted literals in a way that is
/// consistent with the configuration that produced this HIR. For example,
/// the size limits set on the configured HIR will be propagated out to any
/// subsequently constructed HIR or regular expression.
#[derive(Clone, Debug)]
pub(crate) struct ConfiguredHIR {
    config: Config,
    hir: Hir,
}

impl ConfiguredHIR {
    /// Parse the given patterns into a single HIR expression that represents
    /// an alternation of the patterns given.
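    ///
    /// In the parsing path below, each pattern is wrapped in a non-capturing
    /// group before being joined with `|` (e.g., `["ab*", "cd"]` becomes
    /// `(?:ab*)|(?:cd)`), so that one pattern cannot change how a
    /// neighboring pattern is parsed.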
    fn new<P: AsRef<str>>(
        config: Config,
        patterns: &[P],
    ) -> Result<ConfiguredHIR, Error> {
        let hir = if config.is_fixed_strings(patterns) {
            let mut alts = vec![];
            for p in patterns.iter() {
                alts.push(Hir::literal(p.as_ref().as_bytes()));
            }
            log::debug!(
                "assembling HIR from {} fixed string literals",
                alts.len()
            );
            let hir = Hir::alternation(alts);
            hir
        } else {
            let mut alts = vec![];
            for p in patterns.iter() {
                alts.push(if config.fixed_strings {
                    format!("(?:{})", regex_syntax::escape(p.as_ref()))
                } else {
                    format!("(?:{})", p.as_ref())
                });
            }
            let pattern = alts.join("|");
            let ast = ast::parse::ParserBuilder::new()
                .nest_limit(config.nest_limit)
                .octal(config.octal)
                .ignore_whitespace(config.ignore_whitespace)
                .build()
                .parse(&pattern)
                .map_err(Error::generic)?;
            let analysis = AstAnalysis::from_ast(&ast);
            let mut hir = hir::translate::TranslatorBuilder::new()
                .utf8(false)
                .case_insensitive(config.is_case_insensitive(&analysis))
                .multi_line(config.multi_line)
                .dot_matches_new_line(config.dot_matches_new_line)
                .crlf(config.crlf)
                .swap_greed(config.swap_greed)
                .unicode(config.unicode)
                .build()
                .translate(&pattern, &ast)
                .map_err(Error::generic)?;
            if let Some(byte) = config.ban {
                ban::check(&hir, byte)?;
            }
            // We don't need to do this for the fixed-strings case above
            // because is_fixed_strings will return false if any pattern
            // contains a line terminator. Therefore, we don't need to strip
            // it.
            //
            // We go to some pains to avoid doing this in the fixed-strings
            // case because this can result in building a new HIR when
            // ripgrep is given a huge set of literals to search for. And
            // this can actually take a little time. It's not huge, but it's
            // noticeable.
            hir = match config.line_terminator {
                None => hir,
                Some(line_term) => strip_from_match(hir, line_term)?,
            };
            hir
        };
        Ok(ConfiguredHIR { config, hir })
    }

    /// Return a reference to the underlying configuration.
    pub(crate) fn config(&self) -> &Config {
        &self.config
    }

    /// Return a reference to the underlying HIR.
    pub(crate) fn hir(&self) -> &Hir {
        &self.hir
    }

    /// Convert this HIR to a regex that can be used for matching.
    pub(crate) fn to_regex(&self) -> Result<Regex, Error> {
        let meta = Regex::config()
            .utf8_empty(false)
            .nfa_size_limit(Some(self.config.size_limit))
            // We don't expose a knob for this because the one-pass DFA is
            // usually not a perf bottleneck for ripgrep. But we give it some
            // extra room beyond the default.
            .onepass_size_limit(Some(10 * (1 << 20)))
            // Same deal here. The default limit for full DFAs is VERY small,
            // but with ripgrep we can afford to spend a bit more time on
            // building them I think.
            .dfa_size_limit(Some(1 * (1 << 20)))
            .dfa_state_limit(Some(1_000))
            .hybrid_cache_capacity(self.config.dfa_size_limit);
        Regex::builder()
            .configure(meta)
            .build_from_hir(&self.hir)
            .map_err(Error::regex)
    }

    /// Compute the set of non-matching bytes for this HIR expression.
    pub(crate) fn non_matching_bytes(&self) -> ByteSet {
        non_matching_bytes(&self.hir)
    }

    /// Returns the line terminator configured on this expression.
    ///
    /// When we have beginning/end anchors (NOT line anchors), the fast line
    /// searching path isn't quite correct. Or at least, doesn't match the
    /// slow path. Namely, the slow path strips line terminators while the
    /// fast path does not. Since '$' (when multi-line mode is disabled)
    /// doesn't match at line boundaries, the existence of a line terminator
    /// might cause it to not match when it otherwise would with the line
    /// terminator stripped.
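    ///
    /// For example (an illustrative case, not part of the original note):
    /// with multi-line mode disabled, `foo$` can match the line `foo` on the
    /// slow path because the `\n` terminator is stripped before matching,
    /// but it fails on the fast path where the `\n` is still present.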
/// /// Since searching with text anchors is exceptionally rare in the context /// of line oriented searching (multi-line mode is basically always /// enabled), we just disable this optimization when there are text /// anchors. We disable it by not returning a line terminator, since /// without a line terminator, the fast search path can't be executed. /// /// Actually, the above is no longer quite correct. Later on, another /// optimization was added where if the line terminator was in the set of /// bytes that was guaranteed to never be part of a match, then the higher /// level search infrastructure assumes that the fast line-by-line search /// path can still be taken. This optimization applies when multi-line /// search (not multi-line mode) is enabled. In that case, there is no /// configured line terminator since the regex is permitted to match a /// line terminator. But if the regex is guaranteed to never match across /// multiple lines despite multi-line search being requested, we can still /// do the faster and more flexible line-by-line search. This is why the /// non-matching extraction routine removes `\n` when `\A` and `\z` are /// present even though that's not quite correct... /// /// See: pub(crate) fn line_terminator(&self) -> Option { if self.hir.properties().look_set().contains_anchor_haystack() { None } else { self.config.line_terminator } } /// Turns this configured HIR into an equivalent one, but where it must /// match at the start and end of a line. pub(crate) fn into_whole_line(self) -> ConfiguredHIR { let line_anchor_start = Hir::look(self.line_anchor_start()); let line_anchor_end = Hir::look(self.line_anchor_end()); let hir = Hir::concat(vec![line_anchor_start, self.hir, line_anchor_end]); ConfiguredHIR { config: self.config, hir } } /// Turns this configured HIR into an equivalent one, but where it must /// match at word boundaries. pub(crate) fn into_word(self) -> ConfiguredHIR { let hir = Hir::concat(vec![ Hir::look(if self.config.unicode { hir::Look::WordStartHalfUnicode } else { hir::Look::WordStartHalfAscii }), self.hir, Hir::look(if self.config.unicode { hir::Look::WordEndHalfUnicode } else { hir::Look::WordEndHalfAscii }), ]); ConfiguredHIR { config: self.config, hir } } /// Returns the "start line" anchor for this configuration. fn line_anchor_start(&self) -> hir::Look { if self.config.crlf { hir::Look::StartCRLF } else { hir::Look::StartLF } } /// Returns the "end line" anchor for this configuration. fn line_anchor_end(&self) -> hir::Look { if self.config.crlf { hir::Look::EndCRLF } else { hir::Look::EndLF } } } /// Returns true if the given literal string contains any byte from the line /// terminator given. fn has_line_terminator(lineterm: LineTerminator, literal: &str) -> bool { if lineterm.is_crlf() { literal.as_bytes().iter().copied().any(|b| b == b'\r' || b == b'\n') } else { literal.as_bytes().iter().copied().any(|b| b == lineterm.as_byte()) } } grep-regex-0.1.12/src/error.rs000064400000000000000000000061571046102023000142510ustar 00000000000000/// An error that can occur in this crate. /// /// Generally, this error corresponds to problems building a regular /// expression, whether it's in parsing, compilation or a problem with /// guaranteeing a configured optimization. 
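///
/// A hypothetical illustration of observing an error through this crate's
/// public API (shown as text, not a compiled doctest):
///
/// ```text
/// let err = RegexMatcherBuilder::new().build("foo(").unwrap_err();
/// // err.kind() is ErrorKind::Regex(..) describing the syntax error.
/// ```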
#[derive(Clone, Debug)]
pub struct Error {
    kind: ErrorKind,
}

impl Error {
    pub(crate) fn new(kind: ErrorKind) -> Error {
        Error { kind }
    }

    pub(crate) fn regex(err: regex_automata::meta::BuildError) -> Error {
        if let Some(size_limit) = err.size_limit() {
            let kind = ErrorKind::Regex(format!(
                "compiled regex exceeds size limit of {size_limit}",
            ));
            Error { kind }
        } else if let Some(ref err) = err.syntax_error() {
            Error::generic(err)
        } else {
            Error::generic(err)
        }
    }

    pub(crate) fn generic<E: std::error::Error>(err: E) -> Error {
        Error { kind: ErrorKind::Regex(err.to_string()) }
    }

    /// Return the kind of this error.
    pub fn kind(&self) -> &ErrorKind {
        &self.kind
    }
}

/// The kind of an error that can occur.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub enum ErrorKind {
    /// An error that occurred as a result of parsing a regular expression.
    /// This can be a syntax error or an error that results from attempting
    /// to compile a regular expression that is too big.
    ///
    /// The string here is the underlying error converted to a string.
    Regex(String),
    /// An error that occurs when building a regex that isn't permitted to
    /// match a line terminator. In general, building the regex will do its
    /// best to make matching a line terminator impossible (e.g., by removing
    /// `\n` from the `\s` character class), but if the regex contains a
    /// `\n` literal, then there is no reasonable choice that can be made and
    /// therefore an error is reported.
    ///
    /// The string is the literal sequence found in the regex that is not
    /// allowed.
    NotAllowed(String),
    /// This error occurs when a non-ASCII line terminator was provided.
    ///
    /// The invalid byte is included in this error.
    InvalidLineTerminator(u8),
    /// Occurs when a banned byte was found in a pattern.
    Banned(u8),
}

impl std::error::Error for Error {}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        use bstr::ByteSlice;

        match self.kind {
            ErrorKind::Regex(ref s) => write!(f, "{}", s),
            ErrorKind::NotAllowed(ref lit) => {
                write!(f, "the literal {:?} is not allowed in a regex", lit)
            }
            ErrorKind::InvalidLineTerminator(byte) => {
                write!(
                    f,
                    "line terminators must be ASCII, but {byte:?} is not",
                    byte = [byte].as_bstr(),
                )
            }
            ErrorKind::Banned(byte) => {
                write!(
                    f,
                    "pattern contains {byte:?} but it is impossible to match",
                    byte = [byte].as_bstr(),
                )
            }
        }
    }
}
grep-regex-0.1.12/src/lib.rs000064400000000000000000000005031046102023000136530ustar 00000000000000/*!
An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine.
*/
#![deny(missing_docs)]

pub use crate::{
    error::{Error, ErrorKind},
    matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder},
};

mod ast;
mod ban;
mod config;
mod error;
mod literal;
mod matcher;
mod non_matching;
mod strip;
grep-regex-0.1.12/src/literal.rs000064400000000000000000001115061046102023000145470ustar 00000000000000use {
    regex_automata::meta::Regex,
    regex_syntax::hir::{
        self,
        literal::{Literal, Seq},
        Hir,
    },
};

use crate::{config::ConfiguredHIR, error::Error};

/// A type that encapsulates "inner" literal extraction from a regex.
///
/// It uses a huge pile of heuristics to try to pluck out literals from a
/// regex that are in turn used to build a simpler regex that is more amenable
/// to optimization.
///
/// The main idea underlying the validity of this technique is the fact
/// that ripgrep searches individual lines and not across lines. (Unless
/// -U/--multiline is enabled.)
Namely, we can pluck literals out of the regex, /// search for them, find the bounds of the line in which that literal occurs /// and then run the original regex on only that line. This overall works /// really really well in throughput oriented searches because it potentially /// allows ripgrep to spend a lot more time in a fast vectorized routine for /// finding literals as opposed to the (much) slower regex engine. /// /// This optimization was far more important in the old days, but since then, /// Rust's regex engine has actually grown its own (albeit limited) support for /// inner literal optimizations. So this technique doesn't apply as much as it /// used to. /// /// A good example of a regex where this particular extractor helps is /// `\s+(Sherlock|[A-Z]atso[a-z]|Moriarty)\s+`. The `[A-Z]` before the `atso` /// in particular is what inhibits the regex engine's own inner literal /// optimizations from kicking in. This particular regex also did not have any /// inner literals extracted in the old implementation (ripgrep <=13). So this /// particular implementation represents a strict improvement from both the old /// implementation and from the regex engine's own optimizations. (Which could /// in theory be improved still.) #[derive(Clone, Debug)] pub(crate) struct InnerLiterals { seq: Seq, } impl InnerLiterals { /// Create a set of inner literals from the given HIR expression. /// /// If no line terminator was configured, then this always declines to /// extract literals because the inner literal optimization may not be /// valid. /// /// Note that this requires the actual regex that will be used for a search /// because it will query some state about the compiled regex. That state /// may influence inner literal extraction. pub(crate) fn new(chir: &ConfiguredHIR, re: &Regex) -> InnerLiterals { // If there's no line terminator, then the inner literal optimization // at this level is not valid. if chir.config().line_terminator.is_none() { log::trace!( "skipping inner literal extraction, \ no line terminator is set" ); return InnerLiterals::none(); } // If we believe the regex is already accelerated, then just let // the regex engine do its thing. We'll skip the inner literal // optimization. // // ... but only if the regex doesn't have any Unicode word boundaries. // If it does, there's enough of a chance of the regex engine falling // back to a slower engine that it's worth trying our own inner literal // optimization. if re.is_accelerated() { if !chir.hir().properties().look_set().contains_word_unicode() { log::trace!( "skipping inner literal extraction, \ existing regex is believed to already be accelerated", ); return InnerLiterals::none(); } } // In this case, we pretty much know that the regex engine will handle // it as best as possible, even if it isn't reported as accelerated. if chir.hir().properties().is_alternation_literal() { log::trace!( "skipping inner literal extraction, \ found alternation of literals, deferring to regex engine", ); return InnerLiterals::none(); } let seq = Extractor::new().extract_untagged(chir.hir()); InnerLiterals { seq } } /// Returns a infinite set of inner literals, such that it can never /// produce a matcher. pub(crate) fn none() -> InnerLiterals { InnerLiterals { seq: Seq::infinite() } } /// If it is deemed advantageous to do so (via various suspicious /// heuristics), this will return a single regular expression pattern that /// matches a subset of the language matched by the regular expression that /// generated these literal sets. 
The idea here is that the pattern /// returned by this method is much cheaper to search for. i.e., It is /// usually a single literal or an alternation of literals. pub(crate) fn one_regex(&self) -> Result, Error> { let Some(lits) = self.seq.literals() else { return Ok(None) }; if lits.is_empty() { return Ok(None); } let mut alts = vec![]; for lit in lits.iter() { alts.push(Hir::literal(lit.as_bytes())); } let hir = Hir::alternation(alts); log::debug!("extracted fast line regex: {:?}", hir.to_string()); let re = Regex::builder() .configure(Regex::config().utf8_empty(false)) .build_from_hir(&hir) .map_err(Error::regex)?; Ok(Some(re)) } } /// An inner literal extractor. /// /// This is a somewhat stripped down version of the extractor from /// regex-syntax. The main difference is that we try to identify a "best" set /// of required literals while traversing the HIR. #[derive(Debug)] struct Extractor { limit_class: usize, limit_repeat: usize, limit_literal_len: usize, limit_total: usize, } impl Extractor { /// Create a new inner literal extractor with a default configuration. fn new() -> Extractor { Extractor { limit_class: 10, limit_repeat: 10, limit_literal_len: 100, limit_total: 64, } } /// Execute the extractor at the top-level and return an untagged sequence /// of literals. fn extract_untagged(&self, hir: &Hir) -> Seq { let mut seq = self.extract(hir); log::trace!("extracted inner literals: {:?}", seq.seq); seq.seq.optimize_for_prefix_by_preference(); log::trace!( "extracted inner literals after optimization: {:?}", seq.seq ); if !seq.is_good() { log::trace!( "throwing away inner literals because they might be slow" ); seq.make_infinite(); } seq.seq } /// Execute the extractor and return a sequence of literals. fn extract(&self, hir: &Hir) -> TSeq { use regex_syntax::hir::HirKind::*; match *hir.kind() { Empty | Look(_) => TSeq::singleton(self::Literal::exact(vec![])), Literal(hir::Literal(ref bytes)) => { let mut seq = TSeq::singleton(self::Literal::exact(bytes.to_vec())); self.enforce_literal_len(&mut seq); seq } Class(hir::Class::Unicode(ref cls)) => { self.extract_class_unicode(cls) } Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls), Repetition(ref rep) => self.extract_repetition(rep), Capture(hir::Capture { ref sub, .. }) => self.extract(sub), Concat(ref hirs) => self.extract_concat(hirs.iter()), Alternation(ref hirs) => self.extract_alternation(hirs.iter()), } } /// Extract a sequence from the given concatenation. Sequences from each of /// the child HIR expressions are combined via cross product. /// /// This short circuits once the cross product turns into a sequence /// containing only inexact literals. fn extract_concat<'a, I: Iterator>(&self, it: I) -> TSeq { let mut seq = TSeq::singleton(self::Literal::exact(vec![])); let mut prev: Option = None; for hir in it { // If every element in the sequence is inexact, then a cross // product will always be a no-op. Thus, there is nothing else we // can add to it and can quit early. Note that this also includes // infinite sequences. if seq.is_inexact() { // If a concatenation has an empty sequence anywhere, then // it's impossible for the concatenantion to ever match. So we // can just quit now. if seq.is_empty() { return seq; } if seq.is_really_good() { return seq; } prev = Some(match prev { None => seq, Some(prev) => prev.choose(seq), }); seq = TSeq::singleton(self::Literal::exact(vec![])); seq.make_not_prefix(); } // Note that 'cross' also dispatches based on whether we're // extracting prefixes or suffixes. 
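            // (If the sub-sequence extracted below is tagged "not a
            // prefix," e.g., because it came from a nested concatenation
            // that already short circuited, then 'cross' declines to
            // concatenate and instead keeps the better of the two sequences
            // via 'choose'.)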
seq = self.cross(seq, self.extract(hir)); } if let Some(prev) = prev { prev.choose(seq) } else { seq } } /// Extract a sequence from the given alternation. /// /// This short circuits once the union turns into an infinite sequence. fn extract_alternation<'a, I: Iterator>( &self, it: I, ) -> TSeq { let mut seq = TSeq::empty(); for hir in it { // Once our 'seq' is infinite, every subsequent union // operation on it will itself always result in an // infinite sequence. Thus, it can never change and we can // short-circuit. if !seq.is_finite() { break; } seq = self.union(seq, &mut self.extract(hir)); } seq } /// Extract a sequence of literals from the given repetition. We do our /// best, Some examples: /// /// 'a*' => [inexact(a), exact("")] /// 'a*?' => [exact(""), inexact(a)] /// 'a+' => [inexact(a)] /// 'a{3}' => [exact(aaa)] /// 'a{3,5} => [inexact(aaa)] /// /// The key here really is making sure we get the 'inexact' vs 'exact' /// attributes correct on each of the literals we add. For example, the /// fact that 'a*' gives us an inexact 'a' and an exact empty string means /// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)] /// literals being extracted, which might actually be a better prefilter /// than just 'a'. fn extract_repetition(&self, rep: &hir::Repetition) -> TSeq { let mut subseq = self.extract(&rep.sub); match *rep { hir::Repetition { min: 0, max, greedy, .. } => { // When 'max=1', we can retain exactness, since 'a?' is // equivalent to 'a|'. Similarly below, 'a??' is equivalent to // '|a'. if max != Some(1) { subseq.make_inexact(); } let mut empty = TSeq::singleton(Literal::exact(vec![])); if !greedy { std::mem::swap(&mut subseq, &mut empty); } self.union(subseq, &mut empty) } hir::Repetition { min, max: Some(max), .. } if min == max => { assert!(min > 0); // handled above let limit = u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); let mut seq = TSeq::singleton(Literal::exact(vec![])); for _ in 0..std::cmp::min(min, limit) { if seq.is_inexact() { break; } seq = self.cross(seq, subseq.clone()); } if usize::try_from(min).is_err() || min > limit { seq.make_inexact(); } seq } hir::Repetition { min, max: Some(max), .. } if min < max => { assert!(min > 0); // handled above let limit = u32::try_from(self.limit_repeat).unwrap_or(u32::MAX); let mut seq = TSeq::singleton(Literal::exact(vec![])); for _ in 0..std::cmp::min(min, limit) { if seq.is_inexact() { break; } seq = self.cross(seq, subseq.clone()); } seq.make_inexact(); seq } hir::Repetition { .. } => { subseq.make_inexact(); subseq } } } /// Convert the given Unicode class into a sequence of literals if the /// class is small enough. If the class is too big, return an infinite /// sequence. fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> TSeq { if self.class_over_limit_unicode(cls) { return TSeq::infinite(); } let mut seq = TSeq::empty(); for r in cls.iter() { for ch in r.start()..=r.end() { seq.push(Literal::from(ch)); } } self.enforce_literal_len(&mut seq); seq } /// Convert the given byte class into a sequence of literals if the class /// is small enough. If the class is too big, return an infinite sequence. fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> TSeq { if self.class_over_limit_bytes(cls) { return TSeq::infinite(); } let mut seq = TSeq::empty(); for r in cls.iter() { for b in r.start()..=r.end() { seq.push(Literal::from(b)); } } self.enforce_literal_len(&mut seq); seq } /// Returns true if the given Unicode class exceeds the configured limits /// on this extractor. 
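    ///
    /// For example, with the default `limit_class` of 10, `[0-9]` (10
    /// codepoints) is within the limit, while `[A-Za-z]` (52 codepoints)
    /// exceeds it and is treated as an infinite sequence.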
fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool { let mut count = 0; for r in cls.iter() { if count > self.limit_class { return true; } count += r.len(); } count > self.limit_class } /// Returns true if the given byte class exceeds the configured limits on /// this extractor. fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool { let mut count = 0; for r in cls.iter() { if count > self.limit_class { return true; } count += r.len(); } count > self.limit_class } /// Compute the cross product of the two sequences if the result would be /// within configured limits. Otherwise, make `seq2` infinite and cross the /// infinite sequence with `seq1`. fn cross(&self, mut seq1: TSeq, mut seq2: TSeq) -> TSeq { if !seq2.prefix { return seq1.choose(seq2); } if seq1 .max_cross_len(&seq2) .map_or(false, |len| len > self.limit_total) { seq2.make_infinite(); } seq1.cross_forward(&mut seq2); assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); self.enforce_literal_len(&mut seq1); seq1 } /// Union the two sequences if the result would be within configured /// limits. Otherwise, make `seq2` infinite and union the infinite sequence /// with `seq1`. fn union(&self, mut seq1: TSeq, seq2: &mut TSeq) -> TSeq { if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total) { // We try to trim our literal sequences to see if we can make // room for more literals. The idea is that we'd rather trim down // literals already in our sequence if it means we can add a few // more and retain a finite sequence. Otherwise, we'll union with // an infinite sequence and that infects everything and effectively // stops literal extraction in its tracks. // // We do we keep 4 bytes here? Well, it's a bit of an abstraction // leakage. Downstream, the literals may wind up getting fed to // the Teddy algorithm, which supports searching literals up to // length 4. So that's why we pick that number here. Arguably this // should be a tuneable parameter, but it seems a little tricky to // describe. And I'm still unsure if this is the right way to go // about culling literal sequences. seq1.keep_first_bytes(4); seq2.keep_first_bytes(4); seq1.dedup(); seq2.dedup(); if seq1 .max_union_len(seq2) .map_or(false, |len| len > self.limit_total) { seq2.make_infinite(); } } seq1.union(seq2); assert!(seq1.len().map_or(true, |x| x <= self.limit_total)); seq1 } /// Applies the literal length limit to the given sequence. If none of the /// literals in the sequence exceed the limit, then this is a no-op. 
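    ///
    /// Note that literals over the limit are truncated via
    /// `keep_first_bytes` (and thereby become inexact) rather than being
    /// dropped outright.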
    fn enforce_literal_len(&self, seq: &mut TSeq) {
        seq.keep_first_bytes(self.limit_literal_len);
    }
}

#[derive(Clone, Debug)]
struct TSeq {
    seq: Seq,
    prefix: bool,
}

#[allow(dead_code)]
impl TSeq {
    fn empty() -> TSeq {
        TSeq { seq: Seq::empty(), prefix: true }
    }

    fn infinite() -> TSeq {
        TSeq { seq: Seq::infinite(), prefix: true }
    }

    fn singleton(lit: Literal) -> TSeq {
        TSeq { seq: Seq::singleton(lit), prefix: true }
    }

    fn new<I, B>(it: I) -> TSeq
    where
        I: IntoIterator<Item = B>,
        B: AsRef<[u8]>,
    {
        TSeq { seq: Seq::new(it), prefix: true }
    }

    fn literals(&self) -> Option<&[Literal]> {
        self.seq.literals()
    }

    fn push(&mut self, lit: Literal) {
        self.seq.push(lit);
    }

    fn make_inexact(&mut self) {
        self.seq.make_inexact();
    }

    fn make_infinite(&mut self) {
        self.seq.make_infinite();
    }

    fn cross_forward(&mut self, other: &mut TSeq) {
        assert!(other.prefix);
        self.seq.cross_forward(&mut other.seq);
    }

    fn union(&mut self, other: &mut TSeq) {
        self.seq.union(&mut other.seq);
    }

    fn dedup(&mut self) {
        self.seq.dedup();
    }

    fn sort(&mut self) {
        self.seq.sort();
    }

    fn keep_first_bytes(&mut self, len: usize) {
        self.seq.keep_first_bytes(len);
    }

    fn is_finite(&self) -> bool {
        self.seq.is_finite()
    }

    fn is_empty(&self) -> bool {
        self.seq.is_empty()
    }

    fn len(&self) -> Option<usize> {
        self.seq.len()
    }

    fn is_exact(&self) -> bool {
        self.seq.is_exact()
    }

    fn is_inexact(&self) -> bool {
        self.seq.is_inexact()
    }

    fn max_union_len(&self, other: &TSeq) -> Option<usize> {
        self.seq.max_union_len(&other.seq)
    }

    fn max_cross_len(&self, other: &TSeq) -> Option<usize> {
        assert!(other.prefix);
        self.seq.max_cross_len(&other.seq)
    }

    fn min_literal_len(&self) -> Option<usize> {
        self.seq.min_literal_len()
    }

    fn max_literal_len(&self) -> Option<usize> {
        self.seq.max_literal_len()
    }

    // Below are methods specific to a TSeq that aren't just forwarding calls
    // to a Seq method.

    /// Tags this sequence as "not a prefix." When this happens, this
    /// sequence can't be crossed as a suffix of another sequence.
    fn make_not_prefix(&mut self) {
        self.prefix = false;
    }

    /// Returns true if it's believed that the sequence given is "good" for
    /// acceleration. This is useful for determining whether a sequence of
    /// literals has any shot of being fast.
    fn is_good(&self) -> bool {
        if self.has_poisonous_literal() {
            return false;
        }
        let Some(min) = self.min_literal_len() else { return false };
        let Some(len) = self.len() else { return false };
        // If we have some very short literals, then let's require that our
        // sequence is itself very small.
        if min <= 1 {
            return len <= 3;
        }
        min >= 2 && len <= 64
    }

    /// Returns true if it's believed that the sequence given is "really
    /// good" for acceleration. This is useful for short circuiting literal
    /// extraction.
    fn is_really_good(&self) -> bool {
        if self.has_poisonous_literal() {
            return false;
        }
        let Some(min) = self.min_literal_len() else { return false };
        let Some(len) = self.len() else { return false };
        min >= 3 && len <= 8
    }

    /// Returns true if the given sequence contains a poisonous literal.
    fn has_poisonous_literal(&self) -> bool {
        let Some(lits) = self.literals() else { return false };
        lits.iter().any(is_poisonous)
    }

    /// Compare the two sequences and return the one that is believed to be
    /// best according to a hodge podge of heuristics.
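    ///
    /// Roughly, in order: prefer a finite sequence over an infinite one,
    /// prefer a sequence without a poisonous literal, prefer the sequence
    /// whose shortest literal is longer and then prefer the sequence with
    /// more literals. Ties go to `self`.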
    fn choose(self, other: TSeq) -> TSeq {
        let (seq1, seq2) = (self, other);
        if !seq1.is_finite() {
            return seq2;
        } else if !seq2.is_finite() {
            return seq1;
        }
        if seq1.has_poisonous_literal() {
            return seq2;
        } else if seq2.has_poisonous_literal() {
            return seq1;
        }
        let Some(min1) = seq1.min_literal_len() else { return seq2 };
        let Some(min2) = seq2.min_literal_len() else { return seq1 };
        if min1 < min2 {
            return seq2;
        } else if min2 < min1 {
            return seq1;
        }
        // OK because we know both sequences are finite, otherwise they
        // wouldn't have a minimum literal length.
        let len1 = seq1.len().unwrap();
        let len2 = seq2.len().unwrap();
        if len1 < len2 {
            return seq2;
        } else if len2 < len1 {
            return seq1;
        }
        // We could do extra stuff like looking at a background frequency
        // distribution of bytes and picking the one that looks more rare,
        // but for now we just pick one.
        seq1
    }
}

impl FromIterator<Literal> for TSeq {
    fn from_iter<T: IntoIterator<Item = Literal>>(it: T) -> TSeq {
        TSeq { seq: Seq::from_iter(it), prefix: true }
    }
}

/// Returns true if it is believed that this literal is likely to match very
/// frequently, and is thus not a good candidate for a prefilter.
fn is_poisonous(lit: &Literal) -> bool {
    use regex_syntax::hir::literal::rank;
    lit.is_empty() || (lit.len() == 1 && rank(lit.as_bytes()[0]) >= 250)
}

#[cfg(test)]
mod tests {
    use super::*;

    fn e(pattern: impl AsRef<str>) -> Seq {
        let pattern = pattern.as_ref();
        let hir = regex_syntax::ParserBuilder::new()
            .utf8(false)
            .build()
            .parse(pattern)
            .unwrap();
        Extractor::new().extract_untagged(&hir)
    }

    #[allow(non_snake_case)]
    fn E(x: &str) -> Literal {
        Literal::exact(x.as_bytes())
    }

    #[allow(non_snake_case)]
    fn I(x: &str) -> Literal {
        Literal::inexact(x.as_bytes())
    }

    fn seq<I: IntoIterator<Item = Literal>>(it: I) -> Seq {
        Seq::from_iter(it)
    }

    fn inexact<I>(it: I) -> Seq
    where
        I: IntoIterator<Item = Literal>,
    {
        Seq::from_iter(it)
    }

    fn exact<B: AsRef<[u8]>, I: IntoIterator<Item = B>>(it: I) -> Seq {
        Seq::new(it)
    }

    #[test]
    fn various() {
        assert_eq!(e(r"foo"), seq([E("foo")]));
        assert_eq!(e(r"[a-z]foo[a-z]"), seq([I("foo")]));
        assert_eq!(e(r"[a-z](foo)(bar)[a-z]"), seq([I("foobar")]));
        assert_eq!(e(r"[a-z]([a-z]foo)(bar[a-z])[a-z]"), seq([I("foobar")]));
        assert_eq!(e(r"[a-z]([a-z]foo)([a-z]foo)[a-z]"), seq([I("foo")]));
        assert_eq!(e(r"(\d{1,3}\.){3}\d{1,3}"), seq([I(".")]));
        assert_eq!(e(r"[a-z]([a-z]foo){3}[a-z]"), seq([I("foo")]));
        assert_eq!(e(r"[a-z](foo[a-z]){3}[a-z]"), seq([I("foo")]));
        assert_eq!(e(r"[a-z]([a-z]foo[a-z]){3}[a-z]"), seq([I("foo")]));
        assert_eq!(
            e(r"[a-z]([a-z]foo){3}(bar[a-z]){3}[a-z]"),
            seq([I("foobar")])
        );
    }

    // These test that some of our suspicious heuristics try to "pick better
    // literals."
    #[test]
    fn heuristics() {
        // Here, the first literals we stumble across are {ab, cd, ef}. But
        // we keep going and our heuristics decide that {hiya} is better.
        // (And it should be, since it's just one literal and it's longer.)
        assert_eq!(e(r"[a-z]+(ab|cd|ef)[a-z]+hiya[a-z]+"), seq([I("hiya")]));
        // But here, the first alternation becomes "good enough" that literal
        // extraction short circuits early. {hiya} is probably still a better
        // choice here, but {abc, def, ghi} is not bad.
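        // (The alternation short circuits because, once crossed with its
        // neighbors, {abc, def, ghi} satisfies 'is_really_good': a minimum
        // literal length of at least 3 and at most 8 literals.)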
assert_eq!( e(r"[a-z]+(abc|def|ghi)[a-z]+hiya[a-z]+"), seq([I("abc"), I("def"), I("ghi")]) ); } #[test] fn literal() { assert_eq!(exact(["a"]), e("a")); assert_eq!(exact(["aaaaa"]), e("aaaaa")); assert_eq!(exact(["A", "a"]), e("(?i-u)a")); assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab")); assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c")); assert_eq!(Seq::infinite(), e(r"(?-u:\xFF)")); assert_eq!(exact([b"Z"]), e(r"Z")); assert_eq!(exact(["☃"]), e("☃")); assert_eq!(exact(["☃"]), e("(?i)☃")); assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃")); assert_eq!(exact(["Δ"]), e("Δ")); assert_eq!(exact(["δ"]), e("δ")); assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ")); assert_eq!(exact(["Δ", "δ"]), e("(?i)δ")); assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S")); assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s")); assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ")); let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ"; assert_eq!(exact([letters]), e(letters)); } #[test] fn class() { assert_eq!(exact(["a", "b", "c"]), e("[abc]")); assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b")); assert_eq!(exact(["δ", "ε"]), e("[εδ]")); assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]")); } #[test] fn look() { assert_eq!(exact(["ab"]), e(r"a\Ab")); assert_eq!(exact(["ab"]), e(r"a\zb")); assert_eq!(exact(["ab"]), e(r"a(?m:^)b")); assert_eq!(exact(["ab"]), e(r"a(?m:$)b")); assert_eq!(exact(["ab"]), e(r"a\bb")); assert_eq!(exact(["ab"]), e(r"a\Bb")); assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b")); assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b")); assert_eq!(exact(["ab"]), e(r"^ab")); assert_eq!(exact(["ab"]), e(r"$ab")); assert_eq!(exact(["ab"]), e(r"(?m:^)ab")); assert_eq!(exact(["ab"]), e(r"(?m:$)ab")); assert_eq!(exact(["ab"]), e(r"\bab")); assert_eq!(exact(["ab"]), e(r"\Bab")); assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab")); assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab")); assert_eq!(exact(["ab"]), e(r"ab^")); assert_eq!(exact(["ab"]), e(r"ab$")); assert_eq!(exact(["ab"]), e(r"ab(?m:^)")); assert_eq!(exact(["ab"]), e(r"ab(?m:$)")); assert_eq!(exact(["ab"]), e(r"ab\b")); assert_eq!(exact(["ab"]), e(r"ab\B")); assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)")); assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)")); assert_eq!(seq([I("aZ"), E("ab")]), e(r"^aZ*b")); } #[test] fn repetition() { assert_eq!(Seq::infinite(), e(r"a?")); assert_eq!(Seq::infinite(), e(r"a??")); assert_eq!(Seq::infinite(), e(r"a*")); assert_eq!(Seq::infinite(), e(r"a*?")); assert_eq!(inexact([I("a")]), e(r"a+")); assert_eq!(inexact([I("a")]), e(r"(a+)+")); assert_eq!(exact(["ab"]), e(r"aZ{0}b")); assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b")); assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b")); assert_eq!(inexact([I("aZ"), E("ab")]), e(r"aZ*b")); assert_eq!(inexact([E("ab"), I("aZ")]), e(r"aZ*?b")); assert_eq!(inexact([I("aZ")]), e(r"aZ+b")); assert_eq!(inexact([I("aZ")]), e(r"aZ+?b")); assert_eq!(exact(["aZZb"]), e(r"aZ{2}b")); assert_eq!(inexact([I("aZZ")]), e(r"aZ{2,3}b")); assert_eq!(Seq::infinite(), e(r"(abc)?")); assert_eq!(Seq::infinite(), e(r"(abc)??")); assert_eq!(inexact([I("a"), E("b")]), e(r"a*b")); assert_eq!(inexact([E("b"), I("a")]), e(r"a*?b")); assert_eq!(inexact([I("ab")]), e(r"ab+")); assert_eq!(inexact([I("a"), I("b")]), e(r"a*b+")); assert_eq!(inexact([I("a"), I("b"), E("c")]), e(r"a*b*c")); assert_eq!(inexact([I("a"), I("b"), E("c")]), e(r"(a+)?(b+)?c")); assert_eq!(inexact([I("a"), I("b"), E("c")]), e(r"(a+|)(b+|)c")); // A few more similarish but not identical regexes. These may have a // similar problem as above. 
assert_eq!(Seq::infinite(), e(r"a*b*c*")); assert_eq!(inexact([I("a"), I("b"), I("c")]), e(r"a*b*c+")); assert_eq!(inexact([I("a"), I("b")]), e(r"a*b+c")); assert_eq!(inexact([I("a"), I("b")]), e(r"a*b+c*")); assert_eq!(inexact([I("ab"), E("a")]), e(r"ab*")); assert_eq!(inexact([I("ab"), E("ac")]), e(r"ab*c")); assert_eq!(inexact([I("ab")]), e(r"ab+")); assert_eq!(inexact([I("ab")]), e(r"ab+c")); assert_eq!(inexact([I("z"), E("azb")]), e(r"z*azb")); let expected = exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]); assert_eq!(expected, e(r"[ab]{3}")); let expected = inexact([ I("aaa"), I("aab"), I("aba"), I("abb"), I("baa"), I("bab"), I("bba"), I("bbb"), ]); assert_eq!(expected, e(r"[ab]{3,4}")); } #[test] fn concat() { assert_eq!(exact(["abcxyz"]), e(r"abc()xyz")); assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)")); assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz")); assert_eq!(Seq::infinite(), e(r"abc[a&&b]xyz")); assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz")); } #[test] fn alternation() { assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz")); assert_eq!( inexact([E("abc"), I("mZ"), E("mo"), E("xyz")]), e(r"abc|mZ*o|xyz") ); assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz")); assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz")); assert_eq!(exact(["aaa"]), e(r"(?:|aa)aaa")); assert_eq!(Seq::infinite(), e(r"(?:|aa)(?:aaa)*")); assert_eq!(Seq::infinite(), e(r"(?:|aa)(?:aaa)*?")); assert_eq!(Seq::infinite(), e(r"a|b*")); assert_eq!(inexact([E("a"), I("b")]), e(r"a|b+")); assert_eq!(inexact([I("a"), E("b"), E("c")]), e(r"a*b|c")); assert_eq!(Seq::infinite(), e(r"a|(?:b|c*)")); assert_eq!(inexact([I("a"), I("b"), E("c")]), e(r"(a|b)*c|(a|ab)*c")); assert_eq!( exact(["abef", "abgh", "cdef", "cdgh"]), e(r"(ab|cd)(ef|gh)") ); assert_eq!( exact([ "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl", "cdghij", "cdghkl", ]), e(r"(ab|cd)(ef|gh)(ij|kl)") ); } #[test] fn impossible() { // N.B. The extractor in this module "optimizes" the sequence and makes // it infinite if it isn't "good." An empty sequence (generated by a // concatenantion containing an expression that can never match) is // considered "not good." Since infinite sequences are not actionably // and disable optimizations, this winds up being okay. // // The literal extractor in regex-syntax doesn't combine these two // steps and makes the caller choose to optimize. That is, it returns // the sequences as they are. Which in this case, for some of the tests // below, would be an empty Seq and not an infinite Seq. assert_eq!(Seq::infinite(), e(r"[a&&b]")); assert_eq!(Seq::infinite(), e(r"a[a&&b]")); assert_eq!(Seq::infinite(), e(r"[a&&b]b")); assert_eq!(Seq::infinite(), e(r"a[a&&b]b")); assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b")); assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b")); assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b")); assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b")); assert_eq!(Seq::infinite(), e(r"[a&&b]*")); assert_eq!(exact(["MN"]), e(r"M[a&&b]*N")); } // This tests patterns that contain something that defeats literal // detection, usually because it would blow some limit on the total number // of literals that can be returned. // // The main idea is that when literal extraction sees something that // it knows will blow a limit, it replaces it with a marker that says // "any literal will match here." While not necessarily true, the // over-estimation is just fine for the purposes of literal extraction, // because the imprecision doesn't matter: too big is too big. 
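    // For example, `[A-Z]` on its own blows the class limit and becomes an
    // infinite sequence, but in `1[A-Z]`, crossing {1} with that infinite
    // sequence still leaves the useful inexact literal {1}.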
// // This is one of the trickier parts of literal extraction, since we need // to make sure all of our literal extraction operations correctly compose // with the markers. // // Note that unlike in regex-syntax, some of these have "inner" literals // extracted where a prefix or suffix would otherwise not be found. #[test] fn anything() { assert_eq!(Seq::infinite(), e(r".")); assert_eq!(Seq::infinite(), e(r"(?s).")); assert_eq!(Seq::infinite(), e(r"[A-Za-z]")); assert_eq!(Seq::infinite(), e(r"[A-Z]")); assert_eq!(Seq::infinite(), e(r"[A-Z]{0}")); assert_eq!(Seq::infinite(), e(r"[A-Z]?")); assert_eq!(Seq::infinite(), e(r"[A-Z]*")); assert_eq!(Seq::infinite(), e(r"[A-Z]+")); assert_eq!(seq([I("1")]), e(r"1[A-Z]")); assert_eq!(seq([I("1")]), e(r"1[A-Z]2")); assert_eq!(seq([E("123")]), e(r"[A-Z]+123")); assert_eq!(seq([I("123")]), e(r"[A-Z]+123[A-Z]+")); assert_eq!(Seq::infinite(), e(r"1|[A-Z]|3")); assert_eq!(seq([E("1"), I("2"), E("3")]), e(r"1|2[A-Z]|3"),); assert_eq!(seq([E("1"), I("2"), E("3")]), e(r"1|[A-Z]2[A-Z]|3"),); assert_eq!(seq([E("1"), E("2"), E("3")]), e(r"1|[A-Z]2|3"),); assert_eq!(seq([E("1"), I("2"), E("4")]), e(r"1|2[A-Z]3|4"),); assert_eq!(seq([E("2")]), e(r"(?:|1)[A-Z]2")); assert_eq!(inexact([I("a")]), e(r"a.z")); } #[test] fn empty() { assert_eq!(Seq::infinite(), e(r"")); assert_eq!(Seq::infinite(), e(r"^")); assert_eq!(Seq::infinite(), e(r"$")); assert_eq!(Seq::infinite(), e(r"(?m:^)")); assert_eq!(Seq::infinite(), e(r"(?m:$)")); assert_eq!(Seq::infinite(), e(r"\b")); assert_eq!(Seq::infinite(), e(r"\B")); assert_eq!(Seq::infinite(), e(r"(?-u:\b)")); assert_eq!(Seq::infinite(), e(r"(?-u:\B)")); } #[test] fn crazy_repeats() { assert_eq!(Seq::infinite(), e(r"(?:){4294967295}")); assert_eq!(Seq::infinite(), e(r"(?:){64}{64}{64}{64}{64}{64}")); assert_eq!(Seq::infinite(), e(r"x{0}{4294967295}")); assert_eq!(Seq::infinite(), e(r"(?:|){4294967295}")); assert_eq!( Seq::infinite(), e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") ); let repa = "a".repeat(100); assert_eq!( inexact([I(&repa)]), e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") ); } #[test] fn optimize() { // This gets a common prefix that isn't too short. let s = e(r"foobarfoobar|foobar|foobarzfoobar|foobarfoobar"); assert_eq!(seq([I("foobar")]), s); // This also finds a common prefix, but since it's only one byte, it // prefers the multiple literals. let s = e(r"abba|akka|abccba"); assert_eq!(exact(["abba", "akka", "abccba"]), s); let s = e(r"sam|samwise"); assert_eq!(seq([E("sam")]), s); // The empty string is poisonous, so our seq becomes infinite, even // though all literals are exact. let s = e(r"foobarfoo|foo||foozfoo|foofoo"); assert_eq!(Seq::infinite(), s); // A space is also poisonous, so our seq becomes infinite. But this // only gets triggered when we don't have a completely exact sequence. // When the sequence is exact, spaces are okay, since we presume that // any prefilter will match a space more quickly than the regex engine. // (When the sequence is exact, there's a chance of the prefilter being // used without needing the regex engine at all.) 
        let s = e(r"foobarfoo|foo| |foofoo");
        assert_eq!(Seq::infinite(), s);
    }
}
grep-regex-0.1.12/src/matcher.rs000064400000000000000000000620141046102023000145350ustar 00000000000000
use {
    grep_matcher::{
        ByteSet, Captures, LineMatchKind, LineTerminator, Match, Matcher,
        NoError,
    },
    regex_automata::{
        meta::Regex, util::captures::Captures as AutomataCaptures, Input,
        PatternID,
    },
};

use crate::{config::Config, error::Error, literal::InnerLiterals};

/// A builder for constructing a `Matcher` using regular expressions.
///
/// This builder re-exports many of the same options found on the regex
/// crate's builder, in addition to a few other options such as smart case,
/// word matching and the ability to set a line terminator which may enable
/// certain types of optimizations.
///
/// The syntax supported is documented as part of the regex crate:
/// <https://docs.rs/regex/#syntax>.
#[derive(Clone, Debug)]
pub struct RegexMatcherBuilder {
    config: Config,
}

impl Default for RegexMatcherBuilder {
    fn default() -> RegexMatcherBuilder {
        RegexMatcherBuilder::new()
    }
}

impl RegexMatcherBuilder {
    /// Create a new builder for configuring a regex matcher.
    pub fn new() -> RegexMatcherBuilder {
        RegexMatcherBuilder { config: Config::default() }
    }

    /// Build a new matcher using the current configuration for the provided
    /// pattern.
    ///
    /// The syntax supported is documented as part of the regex crate:
    /// <https://docs.rs/regex/#syntax>.
    pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> {
        self.build_many(&[pattern])
    }

    /// Build a new matcher using the current configuration for the provided
    /// patterns. The resulting matcher behaves as if all of the patterns
    /// given are joined together into a single alternation. That is, it
    /// reports matches where at least one of the given patterns matches.
    pub fn build_many<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<RegexMatcher, Error> {
        let mut chir = self.config.build_many(patterns)?;
        // 'whole_line' is a strict subset of 'word', so when it is enabled,
        // we don't need to bother with anything specific to word matching.
        if chir.config().whole_line {
            chir = chir.into_whole_line();
        } else if chir.config().word {
            chir = chir.into_word();
        }
        let regex = chir.to_regex()?;
        log::trace!("final regex: {:?}", chir.hir().to_string());

        let non_matching_bytes = chir.non_matching_bytes();
        // If we can pick out some literals from the regex, then we might be
        // able to build a faster regex that quickly identifies candidate
        // matching lines. The regex engine will do what it can on its own,
        // but we can specifically do a little more when a line terminator is
        // set. For example, for a regex like `\w+foo\w+`, we can look for
        // `foo`, and when a match is found, look for the line containing
        // `foo` and then run the original regex on only that line. (In this
        // case, the regex engine is likely to handle this case for us since
        // it's so simple, but the idea applies.)
        let fast_line_regex = InnerLiterals::new(&chir, &regex).one_regex()?;

        // We override the line terminator in case the configured HIR doesn't
        // support it.
        let mut config = self.config.clone();
        config.line_terminator = chir.line_terminator();
        Ok(RegexMatcher { config, regex, fast_line_regex, non_matching_bytes })
    }

    /// Build a new matcher from a plain alternation of literals.
    ///
    /// Depending on the configuration set by the builder, this may be able
    /// to build a matcher substantially faster than by joining the patterns
    /// with a `|` and calling `build`.
    pub fn build_literals<B: AsRef<str>>(
        &self,
        literals: &[B],
    ) -> Result<RegexMatcher, Error> {
        self.build_many(literals)
    }

    /// Set the value for the case insensitive (`i`) flag.
    ///
    /// When enabled, letters in the pattern will match both upper case and
    /// lower case variants.
    pub fn case_insensitive(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcherBuilder {
        self.config.case_insensitive = yes;
        self
    }

    /// Whether to enable "smart case" or not.
    ///
    /// When smart case is enabled, the builder will automatically enable
    /// case insensitive matching based on how the pattern is written.
    /// Namely, case insensitive mode is enabled when both of the following
    /// things are true:
    ///
    /// 1. The pattern contains at least one literal character. For example,
    ///    `a\w` contains a literal (`a`) but `\w` does not.
    /// 2. Of the literals in the pattern, none of them are considered to be
    ///    uppercase according to Unicode. For example, `foo\pL` has no
    ///    uppercase literals but `Foo\pL` does.
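    ///
    /// A minimal sketch of the effect, mirroring this crate's own
    /// `case_smart` test (note that `is_match` comes from the
    /// `grep_matcher::Matcher` trait, which must be in scope):
    ///
    /// ```
    /// use grep_matcher::Matcher;
    /// use grep_regex::RegexMatcherBuilder;
    ///
    /// // An all-lowercase pattern matches case insensitively.
    /// let m = RegexMatcherBuilder::new().case_smart(true).build(r"abc").unwrap();
    /// assert!(m.is_match(b"ABC").unwrap());
    ///
    /// // An uppercase literal in the pattern keeps matching case sensitive.
    /// let m = RegexMatcherBuilder::new().case_smart(true).build(r"aBc").unwrap();
    /// assert!(!m.is_match(b"ABC").unwrap());
    /// ```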
    pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.case_smart = yes;
        self
    }

    /// Set the value for the multi-line matching (`m`) flag.
    ///
    /// When enabled, `^` matches the beginning of lines and `$` matches the
    /// end of lines.
    ///
    /// By default, they match beginning/end of the input.
    pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.multi_line = yes;
        self
    }

    /// Set the value for the any character (`s`) flag, wherein `.` matches
    /// anything when `s` is set and matches anything except for new line
    /// when it is not set (the default).
    ///
    /// N.B. "matches anything" means "any byte" when Unicode is disabled and
    /// means "any valid UTF-8 encoding of any Unicode scalar value" when
    /// Unicode is enabled.
    pub fn dot_matches_new_line(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcherBuilder {
        self.config.dot_matches_new_line = yes;
        self
    }

    /// Set the value for the greedy swap (`U`) flag.
    ///
    /// When enabled, a pattern like `a*` is lazy (tries to find shortest
    /// match) and `a*?` is greedy (tries to find longest match).
    ///
    /// By default, `a*` is greedy and `a*?` is lazy.
    pub fn swap_greed(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.swap_greed = yes;
        self
    }

    /// Set the value for the ignore whitespace (`x`) flag.
    ///
    /// When enabled, whitespace such as new lines and spaces will be ignored
    /// between expressions of the pattern, and `#` can be used to start a
    /// comment until the next new line.
    pub fn ignore_whitespace(
        &mut self,
        yes: bool,
    ) -> &mut RegexMatcherBuilder {
        self.config.ignore_whitespace = yes;
        self
    }

    /// Set the value for the Unicode (`u`) flag.
    ///
    /// Enabled by default. When disabled, character classes such as `\w`
    /// only match ASCII word characters instead of all Unicode word
    /// characters.
    pub fn unicode(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.unicode = yes;
        self
    }

    /// Whether to support octal syntax or not.
    ///
    /// Octal syntax is a little-known way of uttering Unicode codepoints in
    /// a regular expression. For example, `a`, `\x61`, `\u0061` and
    /// `\141` are all equivalent regular expressions, where the last example
    /// shows octal syntax.
    ///
    /// While supporting octal syntax isn't in and of itself a problem, it
    /// does make good error messages harder. That is, in PCRE based regex
    /// engines, syntax like `\0` invokes a backreference, which is
    /// explicitly unsupported in Rust's regex engine. However, many users
    /// expect it to be supported. Therefore, when octal support is disabled,
    /// the error message will explicitly mention that backreferences aren't
    /// supported.
    ///
    /// Octal syntax is disabled by default.
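    ///
    /// A small sketch of the difference (hedged: it assumes the default
    /// configuration otherwise):
    ///
    /// ```
    /// use grep_matcher::Matcher;
    /// use grep_regex::RegexMatcherBuilder;
    ///
    /// // With octal syntax enabled, `\141` is another way of writing `a`.
    /// let m = RegexMatcherBuilder::new().octal(true).build(r"\141").unwrap();
    /// assert!(m.is_match(b"a").unwrap());
    ///
    /// // With octal syntax disabled (the default), `\141` is rejected.
    /// assert!(RegexMatcherBuilder::new().octal(false).build(r"\141").is_err());
    /// ```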
    pub fn octal(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.octal = yes;
        self
    }

    /// Set the approximate size limit of the compiled regular expression.
    ///
    /// This roughly corresponds to the number of bytes occupied by a single
    /// compiled program. If the program exceeds this number, then a
    /// compilation error is returned.
    pub fn size_limit(&mut self, bytes: usize) -> &mut RegexMatcherBuilder {
        self.config.size_limit = bytes;
        self
    }

    /// Set the approximate size of the cache used by the DFA.
    ///
    /// This roughly corresponds to the number of bytes that the DFA will
    /// use while searching.
    ///
    /// Note that this is a *per thread* limit. There is no way to set a
    /// global limit. In particular, if a regex is used from multiple threads
    /// simultaneously, then each thread may use up to the number of bytes
    /// specified here.
    pub fn dfa_size_limit(
        &mut self,
        bytes: usize,
    ) -> &mut RegexMatcherBuilder {
        self.config.dfa_size_limit = bytes;
        self
    }

    /// Set the nesting limit for this parser.
    ///
    /// The nesting limit controls how deep the abstract syntax tree is
    /// allowed to be. If the AST exceeds the given limit (e.g., with too
    /// many nested groups), then an error is returned by the parser.
    ///
    /// The purpose of this limit is to act as a heuristic to prevent stack
    /// overflow for consumers that do structural induction on an `Ast` using
    /// explicit recursion. While this crate never does this (instead using
    /// constant stack space and moving the call stack to the heap), other
    /// crates may.
    ///
    /// This limit is not checked until the entire Ast is parsed. Therefore,
    /// if callers want to put a limit on the amount of heap space used, then
    /// they should impose a limit on the length, in bytes, of the concrete
    /// pattern string. In particular, this is viable since this parser
    /// implementation will limit itself to heap space proportional to the
    /// length of the pattern string.
    ///
    /// Note that a nest limit of `0` will return a nest limit error for most
    /// patterns but not all. For example, a nest limit of `0` permits `a`
    /// but not `ab`, since `ab` requires a concatenation, which results in a
    /// nest depth of `1`. In general, a nest limit is not something that
    /// manifests in an obvious way in the concrete syntax, therefore, it
    /// should not be used in a granular way.
    pub fn nest_limit(&mut self, limit: u32) -> &mut RegexMatcherBuilder {
        self.config.nest_limit = limit;
        self
    }

    /// Set an ASCII line terminator for the matcher.
    ///
    /// The purpose of setting a line terminator is to enable a certain class
    /// of optimizations that can make line oriented searching faster.
    /// Namely, when a line terminator is enabled, then the builder will
    /// guarantee that the resulting matcher will never be capable of
    /// producing a match that contains the line terminator. Because of this
    /// guarantee, users of the resulting matcher do not need to slowly
    /// execute a search line by line for line oriented search.
    ///
    /// If the aforementioned guarantee about not matching a line terminator
    /// cannot be made because of how the pattern was written, then the
    /// builder will return an error when attempting to construct the
    /// matcher. For example, the pattern `a\sb` will be transformed such
    /// that it can never match `a\nb` (when `\n` is the line terminator),
    /// but the pattern `a\nb` will result in an error since the `\n` cannot
    /// be easily removed without changing the fundamental intent of the
    /// pattern.
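    ///
    /// For example, a sketch mirroring this crate's own tests (`is_match`
    /// comes from the `grep_matcher::Matcher` trait):
    ///
    /// ```
    /// use grep_matcher::Matcher;
    /// use grep_regex::RegexMatcherBuilder;
    ///
    /// // `\s` can match `\n`, so `\n` is stripped from the class and the
    /// // resulting matcher can never match through a line terminator.
    /// let matcher = RegexMatcherBuilder::new()
    ///     .line_terminator(Some(b'\n'))
    ///     .build(r"abc\sxyz")
    ///     .unwrap();
    /// assert!(!matcher.is_match(b"abc\nxyz").unwrap());
    ///
    /// // A literal `\n` cannot be stripped, so building fails.
    /// assert!(RegexMatcherBuilder::new()
    ///     .line_terminator(Some(b'\n'))
    ///     .build(r"a\nz")
    ///     .is_err());
    /// ```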
    ///
    /// If the given line terminator isn't an ASCII byte (`<=127`), then the
    /// builder will return an error when constructing the matcher.
    pub fn line_terminator(
        &mut self,
        line_term: Option<u8>,
    ) -> &mut RegexMatcherBuilder {
        self.config.line_terminator = line_term.map(LineTerminator::byte);
        self
    }

    /// Ban a byte from occurring in a regular expression pattern.
    ///
    /// If this byte is found in the regex pattern, then an error will be
    /// returned at construction time.
    ///
    /// This is useful when binary detection is enabled. Callers will likely
    /// want to ban the same byte that is used to detect binary data, i.e.,
    /// the NUL byte. The reason for this is that when binary detection is
    /// enabled, it's impossible to match a NUL byte because binary detection
    /// will either quit when one is found, or will convert NUL bytes to line
    /// terminators to avoid exorbitant heap usage.
    pub fn ban_byte(&mut self, byte: Option<u8>) -> &mut RegexMatcherBuilder {
        self.config.ban = byte;
        self
    }

    /// Set the line terminator to `\r\n` and enable CRLF matching for `$`
    /// in regex patterns.
    ///
    /// This method sets two distinct settings:
    ///
    /// 1. It causes the line terminator for the matcher to be `\r\n`.
    ///    Namely, this prevents the matcher from ever producing a match
    ///    that contains a `\r` or `\n`.
    /// 2. It enables CRLF mode for `^` and `$`. This means that line anchors
    ///    will treat both `\r` and `\n` as line terminators, but will never
    ///    match between a `\r` and `\n`.
    ///
    /// Note that if you do not wish to set the line terminator but would
    /// still like `$` to match `\r\n` line terminators, then it is valid to
    /// call `crlf(true)` followed by `line_terminator(None)`. Ordering is
    /// important, since `crlf` sets the line terminator, but
    /// `line_terminator` does not touch the `crlf` setting.
    pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        if yes {
            self.config.line_terminator = Some(LineTerminator::crlf());
        } else {
            self.config.line_terminator = None;
        }
        self.config.crlf = yes;
        self
    }

    /// Require that all matches occur on word boundaries.
    ///
    /// Enabling this option is subtly different than putting `\b` assertions
    /// on both sides of your pattern. In particular, a `\b` assertion
    /// requires that one side of it match a word character while the other
    /// match a non-word character. This option, in contrast, merely requires
    /// that one side match a non-word character.
    ///
    /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a
    /// word character. However, `-2` with this `word` option enabled will
    /// match the `-2` in `foo -2 bar`.
    pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.word = yes;
        self
    }

    /// Whether the patterns should be treated as literal strings or not.
    /// When this is active, all characters, including ones that would
    /// normally be special regex meta characters, are matched literally.
    pub fn fixed_strings(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.fixed_strings = yes;
        self
    }

    /// Whether each pattern should match the entire line or not. This is
    /// equivalent to surrounding the pattern with `(?m:^)` and `(?m:$)`.
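    ///
    /// A quick sketch of the behavior (an assumption-checked example rather
    /// than one taken from the crate's own tests):
    ///
    /// ```
    /// use grep_matcher::Matcher;
    /// use grep_regex::RegexMatcherBuilder;
    ///
    /// let m =
    ///     RegexMatcherBuilder::new().whole_line(true).build("abc").unwrap();
    /// assert!(m.is_match(b"abc").unwrap());
    /// // `abc` is only part of the line here, so there is no match.
    /// assert!(!m.is_match(b"abc def").unwrap());
    /// ```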
    pub fn whole_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder {
        self.config.whole_line = yes;
        self
    }
}

/// An implementation of the `Matcher` trait using Rust's standard regex
/// library.
#[derive(Clone, Debug)]
pub struct RegexMatcher {
    /// The configuration specified by the caller.
    config: Config,
    /// The regular expression compiled from the pattern provided by the
    /// caller.
    regex: Regex,
    /// A regex that never reports false negatives but may report false
    /// positives that is believed to be capable of being matched more
    /// quickly than `regex`. Typically, this is a single literal or an
    /// alternation of literals.
    fast_line_regex: Option<Regex>,
    /// A set of bytes that will never appear in a match.
    non_matching_bytes: ByteSet,
}

impl RegexMatcher {
    /// Create a new matcher from the given pattern using the default
    /// configuration.
    pub fn new(pattern: &str) -> Result<RegexMatcher, Error> {
        RegexMatcherBuilder::new().build(pattern)
    }

    /// Create a new matcher from the given pattern using the default
    /// configuration, but matches lines terminated by `\n`.
    ///
    /// This is meant to be a convenience constructor for
    /// using a `RegexMatcherBuilder` and setting its
    /// [`line_terminator`](RegexMatcherBuilder::line_terminator) to `\n`.
    /// The purpose of using this constructor is to permit special
    /// optimizations that help speed up line oriented search. These types of
    /// optimizations are only appropriate when matches span no more than one
    /// line. For this reason, this constructor will return an error if the
    /// given pattern contains a literal `\n`. Other uses of `\n` (such as in
    /// `\s`) are removed transparently.
    pub fn new_line_matcher(pattern: &str) -> Result<RegexMatcher, Error> {
        RegexMatcherBuilder::new().line_terminator(Some(b'\n')).build(pattern)
    }
}

// This implementation just dispatches on the internal matcher impl except
// for the line terminator optimization, which is possibly executed via
// `fast_line_regex`.
impl Matcher for RegexMatcher {
    type Captures = RegexCaptures;
    type Error = NoError;

    #[inline]
    fn find_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<Match>, NoError> {
        let input = Input::new(haystack).span(at..haystack.len());
        Ok(self.regex.find(input).map(|m| Match::new(m.start(), m.end())))
    }

    #[inline]
    fn new_captures(&self) -> Result<RegexCaptures, NoError> {
        Ok(RegexCaptures::new(self.regex.create_captures()))
    }

    #[inline]
    fn capture_count(&self) -> usize {
        self.regex.captures_len()
    }

    #[inline]
    fn capture_index(&self, name: &str) -> Option<usize> {
        self.regex.group_info().to_index(PatternID::ZERO, name)
    }

    #[inline]
    fn try_find_iter<F, E>(
        &self,
        haystack: &[u8],
        mut matched: F,
    ) -> Result<Result<(), E>, NoError>
    where
        F: FnMut(Match) -> Result<bool, E>,
    {
        for m in self.regex.find_iter(haystack) {
            match matched(Match::new(m.start(), m.end())) {
                Ok(true) => continue,
                Ok(false) => return Ok(Ok(())),
                Err(err) => return Ok(Err(err)),
            }
        }
        Ok(Ok(()))
    }

    #[inline]
    fn captures_at(
        &self,
        haystack: &[u8],
        at: usize,
        caps: &mut RegexCaptures,
    ) -> Result<bool, NoError> {
        let input = Input::new(haystack).span(at..haystack.len());
        let caps = caps.captures_mut();
        self.regex.search_captures(&input, caps);
        Ok(caps.is_match())
    }

    #[inline]
    fn shortest_match_at(
        &self,
        haystack: &[u8],
        at: usize,
    ) -> Result<Option<usize>, NoError> {
        let input = Input::new(haystack).span(at..haystack.len());
        Ok(self.regex.search_half(&input).map(|hm| hm.offset()))
    }

    #[inline]
    fn non_matching_bytes(&self) -> Option<&ByteSet> {
        Some(&self.non_matching_bytes)
    }

    #[inline]
    fn line_terminator(&self) -> Option<LineTerminator> {
        self.config.line_terminator
    }

    #[inline]
    fn find_candidate_line(
        &self,
        haystack: &[u8],
    ) -> Result<Option<LineMatchKind>, NoError> {
        Ok(match self.fast_line_regex {
            Some(ref regex) => {
                let input = Input::new(haystack);
                regex
                    .search_half(&input)
                    .map(|hm| LineMatchKind::Candidate(hm.offset()))
            }
            None => {
                self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)
            }
        })
    }
}

/// Represents the match offsets of each capturing group in a match.
///
/// The first, or `0`th capture group, always corresponds to the entire match
/// and is guaranteed to be present when a match occurs. The next capture
/// group, at index `1`, corresponds to the first capturing group in the
/// regex, ordered by the position at which the left opening parenthesis
/// occurs.
///
/// Note that not all capturing groups are guaranteed to be present in a
/// match. For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of
/// `foo` or `bar` will ever be set in any given match.
///
/// In order to access a capture group by name, you'll need to first find the
/// index of the group using the corresponding matcher's `capture_index`
/// method, and then use that index with `RegexCaptures::get`.
#[derive(Clone, Debug)]
pub struct RegexCaptures {
    /// Where the captures are stored.
    caps: AutomataCaptures,
}

impl Captures for RegexCaptures {
    #[inline]
    fn len(&self) -> usize {
        self.caps.group_info().all_group_len()
    }

    #[inline]
    fn get(&self, i: usize) -> Option<Match> {
        self.caps.get_group(i).map(|sp| Match::new(sp.start, sp.end))
    }
}

impl RegexCaptures {
    #[inline]
    pub(crate) fn new(caps: AutomataCaptures) -> RegexCaptures {
        RegexCaptures { caps }
    }

    #[inline]
    pub(crate) fn captures_mut(&mut self) -> &mut AutomataCaptures {
        &mut self.caps
    }
}

#[cfg(test)]
mod tests {
    use grep_matcher::{LineMatchKind, Matcher};

    use super::*;

    // Test that enabling word matches does the right thing and demonstrate
    // the difference between it and surrounding the regex in `\b`.
    #[test]
    fn word() {
        let matcher =
            RegexMatcherBuilder::new().word(true).build(r"-2").unwrap();
        assert!(matcher.is_match(b"abc -2 foo").unwrap());

        let matcher =
            RegexMatcherBuilder::new().word(false).build(r"\b-2\b").unwrap();
        assert!(!matcher.is_match(b"abc -2 foo").unwrap());
    }

    // Test that enabling a line terminator prevents it from matching through
    // said line terminator.
    #[test]
    fn line_terminator() {
        // This works, because there's no line terminator specified.
        let matcher = RegexMatcherBuilder::new().build(r"abc\sxyz").unwrap();
        assert!(matcher.is_match(b"abc\nxyz").unwrap());

        // This doesn't.
        let matcher = RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"abc\sxyz")
            .unwrap();
        assert!(!matcher.is_match(b"abc\nxyz").unwrap());
    }

    // Ensure that the builder returns an error if a line terminator is set
    // and the regex could not be modified to remove a line terminator.
    #[test]
    fn line_terminator_error() {
        assert!(RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"a\nz")
            .is_err())
    }

    // Test that enabling CRLF permits `$` to match at the end of a line.
    #[test]
    fn line_terminator_crlf() {
        // Test normal use of `$` with a `\n` line terminator.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .build(r"abc$")
            .unwrap();
        assert!(matcher.is_match(b"abc\n").unwrap());

        // Test that `$` doesn't match at `\r\n` boundary normally.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .build(r"abc$")
            .unwrap();
        assert!(!matcher.is_match(b"abc\r\n").unwrap());

        // Now check the CRLF handling.
        let matcher = RegexMatcherBuilder::new()
            .multi_line(true)
            .crlf(true)
            .build(r"abc$")
            .unwrap();
        assert!(matcher.is_match(b"abc\r\n").unwrap());
    }
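
    // An extra check, written here as a hedged sketch rather than taken from
    // the upstream test suite: it exercises the exact example given in the
    // `word` documentation above.
    #[test]
    fn word_doc_example() {
        let matcher =
            RegexMatcherBuilder::new().word(true).build(r"-2").unwrap();
        assert!(matcher.is_match(b"foo -2 bar").unwrap());

        // `\b-2\b` requires a word character on the other side of each
        // boundary, and neither the space nor `-` qualifies.
        let matcher = RegexMatcherBuilder::new().build(r"\b-2\b").unwrap();
        assert!(!matcher.is_match(b"foo -2 bar").unwrap());
    }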
    // Test that smart case works.
    #[test]
    fn case_smart() {
        let matcher = RegexMatcherBuilder::new()
            .case_smart(true)
            .build(r"abc")
            .unwrap();
        assert!(matcher.is_match(b"ABC").unwrap());

        let matcher = RegexMatcherBuilder::new()
            .case_smart(true)
            .build(r"aBc")
            .unwrap();
        assert!(!matcher.is_match(b"ABC").unwrap());
    }

    // Test that finding candidate lines works as expected.
    // FIXME: Re-enable this test once inner literal extraction works.
    #[test]
    #[ignore]
    fn candidate_lines() {
        fn is_confirmed(m: LineMatchKind) -> bool {
            match m {
                LineMatchKind::Confirmed(_) => true,
                _ => false,
            }
        }
        fn is_candidate(m: LineMatchKind) -> bool {
            match m {
                LineMatchKind::Candidate(_) => true,
                _ => false,
            }
        }

        // With no line terminator set, we can't employ any optimizations,
        // so we get a confirmed match.
        let matcher = RegexMatcherBuilder::new().build(r"\wfoo\s").unwrap();
        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(is_confirmed(m));

        // With a line terminator and a regex specially crafted to have an
        // easy-to-detect inner literal, we can apply an optimization that
        // quickly finds candidate matches.
        let matcher = RegexMatcherBuilder::new()
            .line_terminator(Some(b'\n'))
            .build(r"\wfoo\s")
            .unwrap();
        let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap();
        assert!(is_candidate(m));
    }
}
grep-regex-0.1.12/src/non_matching.rs000064400000000000000000000122071046102023000155550ustar 00000000000000
use {
    grep_matcher::ByteSet,
    regex_syntax::{
        hir::{self, Hir, HirKind, Look},
        utf8::Utf8Sequences,
    },
};

/// Return a confirmed set of non-matching bytes from the given expression.
pub(crate) fn non_matching_bytes(expr: &Hir) -> ByteSet {
    let mut set = ByteSet::full();
    remove_matching_bytes(expr, &mut set);
    set
}

/// Remove any bytes from the given set that can occur in a match produced by
/// the given expression.
fn remove_matching_bytes(expr: &Hir, set: &mut ByteSet) {
    match *expr.kind() {
        HirKind::Empty
        | HirKind::Look(Look::WordAscii | Look::WordAsciiNegate)
        | HirKind::Look(Look::WordUnicode | Look::WordUnicodeNegate)
        | HirKind::Look(Look::WordStartAscii | Look::WordStartUnicode)
        | HirKind::Look(Look::WordEndAscii | Look::WordEndUnicode)
        | HirKind::Look(
            Look::WordStartHalfAscii | Look::WordStartHalfUnicode,
        )
        | HirKind::Look(Look::WordEndHalfAscii | Look::WordEndHalfUnicode) => {
        }
        HirKind::Look(Look::Start | Look::End) => {
            // FIXME: This is wrong, but not doing this leads to incorrect
            // results because of how anchored searches are implemented in
            // the 'grep-searcher' crate.
            set.remove(b'\n');
        }
        HirKind::Look(Look::StartLF | Look::EndLF) => {
            set.remove(b'\n');
        }
        HirKind::Look(Look::StartCRLF | Look::EndCRLF) => {
            set.remove(b'\r');
            set.remove(b'\n');
        }
        HirKind::Literal(hir::Literal(ref lit)) => {
            for &b in lit.iter() {
                set.remove(b);
            }
        }
        HirKind::Class(hir::Class::Unicode(ref cls)) => {
            for range in cls.iter() {
                // This is presumably faster than encoding every codepoint
                // to UTF-8 and then removing those bytes from the set.
                for seq in Utf8Sequences::new(range.start(), range.end()) {
                    for byte_range in seq.as_slice() {
                        set.remove_all(byte_range.start, byte_range.end);
                    }
                }
            }
        }
        HirKind::Class(hir::Class::Bytes(ref cls)) => {
            for range in cls.iter() {
                set.remove_all(range.start(), range.end());
            }
        }
        HirKind::Repetition(ref x) => {
            remove_matching_bytes(&x.sub, set);
        }
        HirKind::Capture(ref x) => {
            remove_matching_bytes(&x.sub, set);
        }
        HirKind::Concat(ref xs) => {
            for x in xs {
                remove_matching_bytes(x, set);
            }
        }
        HirKind::Alternation(ref xs) => {
            for x in xs {
                remove_matching_bytes(x, set);
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use {grep_matcher::ByteSet, regex_syntax::ParserBuilder};

    use super::non_matching_bytes;

    fn extract(pattern: &str) -> ByteSet {
        let expr =
            ParserBuilder::new().utf8(false).build().parse(pattern).unwrap();
        non_matching_bytes(&expr)
    }

    fn sparse(set: &ByteSet) -> Vec<u8> {
        let mut sparse_set = vec![];
        for b in (0..256).map(|b| b as u8) {
            if set.contains(b) {
                sparse_set.push(b);
            }
        }
        sparse_set
    }

    fn sparse_except(except: &[u8]) -> Vec<u8> {
        let mut except_set = vec![false; 256];
        for &b in except {
            except_set[b as usize] = true;
        }

        let mut set = vec![];
        for b in (0..256).map(|b| b as u8) {
            if !except_set[b as usize] {
                set.push(b);
            }
        }
        set
    }

    #[test]
    fn dot() {
        assert_eq!(
            sparse(&extract(".")),
            vec![
                b'\n', 192, 193, 245, 246, 247, 248, 249, 250, 251, 252, 253,
                254, 255,
            ]
        );
        assert_eq!(
            sparse(&extract("(?s).")),
            vec![
                192, 193, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254,
                255,
            ]
        );
        assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']);
        assert_eq!(sparse(&extract("(?s-u).")), vec![]);
    }

    #[test]
    fn literal() {
        assert_eq!(sparse(&extract("a")), sparse_except(&[b'a']));
        assert_eq!(sparse(&extract("☃")), sparse_except(&[0xE2, 0x98, 0x83]));
        assert_eq!(sparse(&extract(r"\xFF")), sparse_except(&[0xC3, 0xBF]));
        assert_eq!(sparse(&extract(r"(?-u)\xFF")), sparse_except(&[0xFF]));
    }

    #[test]
    fn anchor() {
        // FIXME: The first four tests below should correspond to a full set
        // of bytes for the non-matching bytes I think.
        assert_eq!(sparse(&extract(r"^")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"$")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"\A")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"\z")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"(?m)^")), sparse_except(&[b'\n']));
        assert_eq!(sparse(&extract(r"(?m)$")), sparse_except(&[b'\n']));
    }
}
grep-regex-0.1.12/src/strip.rs000064400000000000000000000142711046102023000142550ustar 00000000000000
use {
    grep_matcher::LineTerminator,
    regex_syntax::hir::{self, Hir, HirKind},
};

use crate::error::{Error, ErrorKind};

/// Return an HIR that is guaranteed to never match the given line terminator,
/// if possible.
///
/// If the transformation isn't possible, then an error is returned.
///
/// In general, if a literal line terminator occurs anywhere in the HIR, then
/// this will return an error. However, if the line terminator occurs within
/// a character class with at least one other character (that isn't also a
/// line terminator), then the line terminator is simply stripped from that
/// class.
///
/// If the given line terminator is not ASCII, then this function returns an
/// error.
///
/// Note that as of regex 1.9, this routine could theoretically be implemented
/// without returning an error. Namely, for example, we could turn
/// `foo\nbar` into `foo[a&&b]bar`. That is, replace the line terminator with
/// a sub-expression that can never match anything.
/// Thus, ripgrep would accept such regexes and just silently not match
/// anything. Regex versions prior to 1.8 don't support such constructs. I
/// ended up deciding to leave the existing behavior of returning an error
/// instead. For example:
///
/// ```text
/// $ echo -n 'foo\nbar\n' | rg 'foo\nbar'
/// the literal '"\n"' is not allowed in a regex
///
/// Consider enabling multiline mode with the --multiline flag (or -U for
/// short). When multiline mode is enabled, new line characters can be
/// matched.
/// ```
///
/// This looks like a good error message to me, and even suggests a flag that
/// the user can use instead.
pub(crate) fn strip_from_match(
    expr: Hir,
    line_term: LineTerminator,
) -> Result<Hir, Error> {
    if line_term.is_crlf() {
        let expr1 = strip_from_match_ascii(expr, b'\r')?;
        strip_from_match_ascii(expr1, b'\n')
    } else {
        strip_from_match_ascii(expr, line_term.as_byte())
    }
}

/// The implementation of strip_from_match. The given byte must be ASCII.
/// This function returns an error otherwise. It also returns an error if
/// it couldn't remove the byte from the given regex without leaving an empty
/// character class in its place.
fn strip_from_match_ascii(expr: Hir, byte: u8) -> Result<Hir, Error> {
    if !byte.is_ascii() {
        return Err(Error::new(ErrorKind::InvalidLineTerminator(byte)));
    }
    let ch = char::from(byte);
    let invalid = || Err(Error::new(ErrorKind::NotAllowed(ch.to_string())));
    Ok(match expr.into_kind() {
        HirKind::Empty => Hir::empty(),
        HirKind::Literal(hir::Literal(lit)) => {
            if lit.iter().find(|&&b| b == byte).is_some() {
                return invalid();
            }
            Hir::literal(lit)
        }
        HirKind::Class(hir::Class::Unicode(mut cls)) => {
            if cls.ranges().is_empty() {
                return Ok(Hir::class(hir::Class::Unicode(cls)));
            }
            let remove = hir::ClassUnicode::new(Some(
                hir::ClassUnicodeRange::new(ch, ch),
            ));
            cls.difference(&remove);
            if cls.ranges().is_empty() {
                return invalid();
            }
            Hir::class(hir::Class::Unicode(cls))
        }
        HirKind::Class(hir::Class::Bytes(mut cls)) => {
            if cls.ranges().is_empty() {
                return Ok(Hir::class(hir::Class::Bytes(cls)));
            }
            let remove = hir::ClassBytes::new(Some(
                hir::ClassBytesRange::new(byte, byte),
            ));
            cls.difference(&remove);
            if cls.ranges().is_empty() {
                return invalid();
            }
            Hir::class(hir::Class::Bytes(cls))
        }
        HirKind::Look(x) => Hir::look(x),
        HirKind::Repetition(mut x) => {
            x.sub = Box::new(strip_from_match_ascii(*x.sub, byte)?);
            Hir::repetition(x)
        }
        HirKind::Capture(mut x) => {
            x.sub = Box::new(strip_from_match_ascii(*x.sub, byte)?);
            Hir::capture(x)
        }
        HirKind::Concat(xs) => {
            let xs = xs
                .into_iter()
                .map(|e| strip_from_match_ascii(e, byte))
                .collect::<Result<Vec<Hir>, Error>>()?;
            Hir::concat(xs)
        }
        HirKind::Alternation(xs) => {
            let xs = xs
                .into_iter()
                .map(|e| strip_from_match_ascii(e, byte))
                .collect::<Result<Vec<Hir>, Error>>()?;
            Hir::alternation(xs)
        }
    })
}

#[cfg(test)]
mod tests {
    use regex_syntax::Parser;

    use super::{strip_from_match, LineTerminator};
    use crate::error::Error;

    fn roundtrip(pattern: &str, byte: u8) -> String {
        roundtrip_line_term(pattern, LineTerminator::byte(byte)).unwrap()
    }

    fn roundtrip_crlf(pattern: &str) -> String {
        roundtrip_line_term(pattern, LineTerminator::crlf()).unwrap()
    }

    fn roundtrip_err(pattern: &str, byte: u8) -> Result<String, Error> {
        roundtrip_line_term(pattern, LineTerminator::byte(byte))
    }

    fn roundtrip_line_term(
        pattern: &str,
        line_term: LineTerminator,
    ) -> Result<String, Error> {
        let expr1 = Parser::new().parse(pattern).unwrap();
        let expr2 = strip_from_match(expr1, line_term)?;
        Ok(expr2.to_string())
    }

    #[test]
    fn various() {
        assert_eq!(roundtrip(r"[a\n]", b'\n'), "a");
        assert_eq!(roundtrip(r"[a\n]", b'a'), "\n");
        assert_eq!(roundtrip_crlf(r"[a\n]"), "a");
        assert_eq!(roundtrip_crlf(r"[a\r]"), "a");
        assert_eq!(roundtrip_crlf(r"[a\r\n]"), "a");

        assert_eq!(roundtrip(r"(?-u)\s", b'a'), r"(?-u:[\x09-\x0D\x20])");
        assert_eq!(roundtrip(r"(?-u)\s", b'\n'), r"(?-u:[\x09\x0B-\x0D\x20])");

        assert!(roundtrip_err(r"\n", b'\n').is_err());
        assert!(roundtrip_err(r"abc\n", b'\n').is_err());
        assert!(roundtrip_err(r"\nabc", b'\n').is_err());
        assert!(roundtrip_err(r"abc\nxyz", b'\n').is_err());
        assert!(roundtrip_err(r"\x0A", b'\n').is_err());
        assert!(roundtrip_err(r"\u000A", b'\n').is_err());
        assert!(roundtrip_err(r"\U0000000A", b'\n').is_err());
        assert!(roundtrip_err(r"\u{A}", b'\n').is_err());
        assert!(roundtrip_err("\n", b'\n').is_err());
    }
}
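
// An extra sanity check, sketched here rather than taken from the upstream
// test suite: stripping the line terminator from a class that contains
// nothing else must fail, since it would leave an empty (unmatchable) class
// behind.
#[cfg(test)]
mod extra_tests {
    use regex_syntax::Parser;

    use super::{strip_from_match, LineTerminator};

    #[test]
    fn lone_line_terminator_class_is_rejected() {
        let expr = Parser::new().parse(r"[\n]").unwrap();
        assert!(strip_from_match(expr, LineTerminator::byte(b'\n')).is_err());
    }
}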