globset-0.4.8/.cargo_vcs_info.json0000644000000001120000000000100124760ustar { "git": { "sha1": "caba5c4348767a0379a23d3fa8b247e35db4b53b" } } globset-0.4.8/COPYING000064400000000000000000000001760072674642500123630ustar 00000000000000This project is dual-licensed under the Unlicense and MIT licenses. You may use this code under the terms of either license. globset-0.4.8/Cargo.toml0000644000000033320000000000100105030ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] edition = "2018" name = "globset" version = "0.4.8" authors = ["Andrew Gallant "] description = "Cross platform single glob and glob set matching. 
Glob set matching is the\nprocess of matching one or more glob patterns against a single candidate path\nsimultaneously, and returning all of the globs that matched.\n" homepage = "https://github.com/BurntSushi/ripgrep/tree/master/crates/globset" documentation = "https://docs.rs/globset" readme = "README.md" keywords = ["regex", "glob", "multiple", "set", "pattern"] license = "Unlicense/MIT" repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/globset" [lib] name = "globset" bench = false [dependencies.aho-corasick] version = "0.7.3" [dependencies.bstr] version = "0.2.0" features = ["std"] default-features = false [dependencies.fnv] version = "1.0.6" [dependencies.log] version = "0.4.5" [dependencies.regex] version = "1.1.5" features = ["perf", "std"] default-features = false [dependencies.serde] version = "1.0.104" optional = true [dev-dependencies.glob] version = "0.3.0" [dev-dependencies.lazy_static] version = "1" [dev-dependencies.serde_json] version = "1.0.45" [features] serde1 = ["serde"] simd-accel = [] globset-0.4.8/Cargo.toml.orig000064400000000000000000000021130072674642500142100ustar 00000000000000[package] name = "globset" version = "0.4.8" #:version authors = ["Andrew Gallant "] description = """ Cross platform single glob and glob set matching. Glob set matching is the process of matching one or more glob patterns against a single candidate path simultaneously, and returning all of the globs that matched. 
""" documentation = "https://docs.rs/globset" homepage = "https://github.com/BurntSushi/ripgrep/tree/master/crates/globset" repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/globset" readme = "README.md" keywords = ["regex", "glob", "multiple", "set", "pattern"] license = "Unlicense/MIT" edition = "2018" [lib] name = "globset" bench = false [dependencies] aho-corasick = "0.7.3" bstr = { version = "0.2.0", default-features = false, features = ["std"] } fnv = "1.0.6" log = "0.4.5" regex = { version = "1.1.5", default-features = false, features = ["perf", "std"] } serde = { version = "1.0.104", optional = true } [dev-dependencies] glob = "0.3.0" lazy_static = "1" serde_json = "1.0.45" [features] simd-accel = [] serde1 = ["serde"] globset-0.4.8/LICENSE-MIT000064400000000000000000000020710072674642500127600ustar 00000000000000The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
globset-0.4.8/README.md000064400000000000000000000074320072674642500126110ustar 00000000000000globset ======= Cross platform single glob and glob set matching. Glob set matching is the process of matching one or more glob patterns against a single candidate path simultaneously, and returning all of the globs that matched. [![Build status](https://github.com/BurntSushi/ripgrep/workflows/ci/badge.svg)](https://github.com/BurntSushi/ripgrep/actions) [![](https://img.shields.io/crates/v/globset.svg)](https://crates.io/crates/globset) Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). ### Documentation [https://docs.rs/globset](https://docs.rs/globset) ### Usage Add this to your `Cargo.toml`: ```toml [dependencies] globset = "0.4" ``` ### Features * `serde1`: Enables implementing Serde traits on the `Glob` type. ### Example: one glob This example shows how to match a single glob against a single file path. ```rust use globset::Glob; let glob = Glob::new("*.rs")?.compile_matcher(); assert!(glob.is_match("foo.rs")); assert!(glob.is_match("foo/bar.rs")); assert!(!glob.is_match("Cargo.toml")); ``` ### Example: configuring a glob matcher This example shows how to use a `GlobBuilder` to configure aspects of match semantics. In this example, we prevent wildcards from matching path separators. ```rust use globset::GlobBuilder; let glob = GlobBuilder::new("*.rs") .literal_separator(true).build()?.compile_matcher(); assert!(glob.is_match("foo.rs")); assert!(!glob.is_match("foo/bar.rs")); // no longer matches assert!(!glob.is_match("Cargo.toml")); ``` ### Example: match multiple globs at once This example shows how to match multiple glob patterns at once. ```rust use globset::{Glob, GlobSetBuilder}; let mut builder = GlobSetBuilder::new(); // A GlobBuilder can be used to configure each glob's match semantics // independently. 
builder.add(Glob::new("*.rs")?); builder.add(Glob::new("src/lib.rs")?); builder.add(Glob::new("src/**/foo.rs")?); let set = builder.build()?; assert_eq!(set.matches("src/bar/baz/foo.rs"), vec![0, 2]); ``` ### Performance This crate implements globs by converting them to regular expressions, and executing them with the [`regex`](https://github.com/rust-lang-nursery/regex) crate. For single glob matching, performance of this crate should be roughly on par with the performance of the [`glob`](https://github.com/rust-lang-nursery/glob) crate. (`*_regex` correspond to benchmarks for this library while `*_glob` correspond to benchmarks for the `glob` library.) Optimizations in the `regex` crate may propel this library past `glob`, particularly when matching longer paths. ``` test ext_glob ... bench: 425 ns/iter (+/- 21) test ext_regex ... bench: 175 ns/iter (+/- 10) test long_glob ... bench: 182 ns/iter (+/- 11) test long_regex ... bench: 173 ns/iter (+/- 10) test short_glob ... bench: 69 ns/iter (+/- 4) test short_regex ... bench: 83 ns/iter (+/- 2) ``` The primary performance advantage of this crate is when matching multiple globs against a single path. With the `glob` crate, one must match each glob synchronously, one after the other. In this crate, many can be matched simultaneously. For example: ``` test many_short_glob ... bench: 1,063 ns/iter (+/- 47) test many_short_regex_set ... bench: 186 ns/iter (+/- 11) ``` ### Comparison with the [`glob`](https://github.com/rust-lang-nursery/glob) crate * Supports alternate "or" globs, e.g., `*.{foo,bar}`. * Can match non-UTF-8 file paths correctly. * Supports matching multiple globs at once. * Doesn't provide a recursive directory iterator of matching file paths, although I believe this crate should grow one eventually. * Supports case insensitive and require-literal-separator match options, but **doesn't** support the require-literal-leading-dot option. 
globset-0.4.8/UNLICENSE000064400000000000000000000022730072674642500126000ustar 00000000000000This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For more information, please refer to <https://unlicense.org/> globset-0.4.8/benches/bench.rs000064400000000000000000000052770072674642500143730ustar 00000000000000/*! This module benchmarks the glob implementation. For benchmarks on the ripgrep tool itself, see the benchsuite directory. 
*/ #![feature(test)] extern crate test; use globset::{Candidate, Glob, GlobMatcher, GlobSet, GlobSetBuilder}; const EXT: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt"; const EXT_PAT: &'static str = "*.txt"; const SHORT: &'static str = "some/needle.txt"; const SHORT_PAT: &'static str = "some/**/needle.txt"; const LONG: &'static str = "some/a/bigger/path/to/the/crazy/needle.txt"; const LONG_PAT: &'static str = "some/**/needle.txt"; fn new_glob(pat: &str) -> glob::Pattern { glob::Pattern::new(pat).unwrap() } fn new_reglob(pat: &str) -> GlobMatcher { Glob::new(pat).unwrap().compile_matcher() } fn new_reglob_many(pats: &[&str]) -> GlobSet { let mut builder = GlobSetBuilder::new(); for pat in pats { builder.add(Glob::new(pat).unwrap()); } builder.build().unwrap() } #[bench] fn ext_glob(b: &mut test::Bencher) { let pat = new_glob(EXT_PAT); b.iter(|| assert!(pat.matches(EXT))); } #[bench] fn ext_regex(b: &mut test::Bencher) { let set = new_reglob(EXT_PAT); let cand = Candidate::new(EXT); b.iter(|| assert!(set.is_match_candidate(&cand))); } #[bench] fn short_glob(b: &mut test::Bencher) { let pat = new_glob(SHORT_PAT); b.iter(|| assert!(pat.matches(SHORT))); } #[bench] fn short_regex(b: &mut test::Bencher) { let set = new_reglob(SHORT_PAT); let cand = Candidate::new(SHORT); b.iter(|| assert!(set.is_match_candidate(&cand))); } #[bench] fn long_glob(b: &mut test::Bencher) { let pat = new_glob(LONG_PAT); b.iter(|| assert!(pat.matches(LONG))); } #[bench] fn long_regex(b: &mut test::Bencher) { let set = new_reglob(LONG_PAT); let cand = Candidate::new(LONG); b.iter(|| assert!(set.is_match_candidate(&cand))); } const MANY_SHORT_GLOBS: &'static [&'static str] = &[ // Taken from a random .gitignore on my system. 
".*.swp", "tags", "target", "*.lock", "tmp", "*.csv", "*.fst", "*-got", "*.csv.idx", "words", "98m*", "dict", "test", "months", ]; const MANY_SHORT_SEARCH: &'static str = "98m-blah.csv.idx"; #[bench] fn many_short_glob(b: &mut test::Bencher) { let pats: Vec<_> = MANY_SHORT_GLOBS.iter().map(|&s| new_glob(s)).collect(); b.iter(|| { let mut count = 0; for pat in &pats { if pat.matches(MANY_SHORT_SEARCH) { count += 1; } } assert_eq!(2, count); }) } #[bench] fn many_short_regex_set(b: &mut test::Bencher) { let set = new_reglob_many(MANY_SHORT_GLOBS); b.iter(|| assert_eq!(2, set.matches(MANY_SHORT_SEARCH).iter().count())); } globset-0.4.8/src/glob.rs000064400000000000000000001516070072674642500134160ustar 00000000000000use std::fmt; use std::hash; use std::iter; use std::ops::{Deref, DerefMut}; use std::path::{is_separator, Path}; use std::str; use regex; use regex::bytes::Regex; use crate::{new_regex, Candidate, Error, ErrorKind}; /// Describes a matching strategy for a particular pattern. /// /// This provides a way to more quickly determine whether a pattern matches /// a particular file path in a way that scales with a large number of /// patterns. For example, if many patterns are of the form `*.ext`, then it's /// possible to test whether any of those patterns matches by looking up a /// file path's extension in a hash table. #[derive(Clone, Debug, Eq, PartialEq)] pub enum MatchStrategy { /// A pattern matches if and only if the entire file path matches this /// literal string. Literal(String), /// A pattern matches if and only if the file path's basename matches this /// literal string. BasenameLiteral(String), /// A pattern matches if and only if the file path's extension matches this /// literal string. Extension(String), /// A pattern matches if and only if this prefix literal is a prefix of the /// candidate file path. Prefix(String), /// A pattern matches if and only if this prefix literal is a prefix of the /// candidate file path. 
/// /// An exception: if `component` is true, then `suffix` must appear at the /// beginning of a file path or immediately following a `/`. Suffix { /// The actual suffix. suffix: String, /// Whether this must start at the beginning of a path component. component: bool, }, /// A pattern matches only if the given extension matches the file path's /// extension. Note that this is a necessary but NOT sufficient criterion. /// Namely, if the extension matches, then a full regex search is still /// required. RequiredExtension(String), /// A regex needs to be used for matching. Regex, } impl MatchStrategy { /// Returns a matching strategy for the given pattern. pub fn new(pat: &Glob) -> MatchStrategy { if let Some(lit) = pat.basename_literal() { MatchStrategy::BasenameLiteral(lit) } else if let Some(lit) = pat.literal() { MatchStrategy::Literal(lit) } else if let Some(ext) = pat.ext() { MatchStrategy::Extension(ext) } else if let Some(prefix) = pat.prefix() { MatchStrategy::Prefix(prefix) } else if let Some((suffix, component)) = pat.suffix() { MatchStrategy::Suffix { suffix: suffix, component: component } } else if let Some(ext) = pat.required_ext() { MatchStrategy::RequiredExtension(ext) } else { MatchStrategy::Regex } } } /// Glob represents a successfully parsed shell glob pattern. /// /// It cannot be used directly to match file paths, but it can be converted /// to a regular expression string or a matcher. 
#[derive(Clone, Debug, Eq)] pub struct Glob { glob: String, re: String, opts: GlobOptions, tokens: Tokens, } impl PartialEq for Glob { fn eq(&self, other: &Glob) -> bool { self.glob == other.glob && self.opts == other.opts } } impl hash::Hash for Glob { fn hash(&self, state: &mut H) { self.glob.hash(state); self.opts.hash(state); } } impl fmt::Display for Glob { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.glob.fmt(f) } } impl str::FromStr for Glob { type Err = Error; fn from_str(glob: &str) -> Result { Self::new(glob) } } /// A matcher for a single pattern. #[derive(Clone, Debug)] pub struct GlobMatcher { /// The underlying pattern. pat: Glob, /// The pattern, as a compiled regex. re: Regex, } impl GlobMatcher { /// Tests whether the given path matches this pattern or not. pub fn is_match>(&self, path: P) -> bool { self.is_match_candidate(&Candidate::new(path.as_ref())) } /// Tests whether the given path matches this pattern or not. pub fn is_match_candidate(&self, path: &Candidate<'_>) -> bool { self.re.is_match(&path.path) } /// Returns the `Glob` used to compile this matcher. pub fn glob(&self) -> &Glob { &self.pat } } /// A strategic matcher for a single pattern. #[cfg(test)] #[derive(Clone, Debug)] struct GlobStrategic { /// The match strategy to use. strategy: MatchStrategy, /// The underlying pattern. pat: Glob, /// The pattern, as a compiled regex. re: Regex, } #[cfg(test)] impl GlobStrategic { /// Tests whether the given path matches this pattern or not. fn is_match>(&self, path: P) -> bool { self.is_match_candidate(&Candidate::new(path.as_ref())) } /// Tests whether the given path matches this pattern or not. 
fn is_match_candidate(&self, candidate: &Candidate<'_>) -> bool {
        let byte_path = &*candidate.path;

        match self.strategy {
            MatchStrategy::Literal(ref lit) => lit.as_bytes() == byte_path,
            MatchStrategy::BasenameLiteral(ref lit) => {
                lit.as_bytes() == &*candidate.basename
            }
            MatchStrategy::Extension(ref ext) => {
                ext.as_bytes() == &*candidate.ext
            }
            MatchStrategy::Prefix(ref pre) => {
                starts_with(pre.as_bytes(), byte_path)
            }
            MatchStrategy::Suffix { ref suffix, component } => {
                // A component suffix starts with `/`; the path may also be
                // exactly the suffix without that leading separator.
                if component && byte_path == &suffix.as_bytes()[1..] {
                    return true;
                }
                ends_with(suffix.as_bytes(), byte_path)
            }
            MatchStrategy::RequiredExtension(ref ext) => {
                // The extension is necessary but not sufficient, so the
                // regex still has the final say.
                let ext = ext.as_bytes();
                &*candidate.ext == ext && self.re.is_match(byte_path)
            }
            MatchStrategy::Regex => self.re.is_match(byte_path),
        }
    }
}

/// A builder for a pattern.
///
/// This builder enables configuring the match semantics of a pattern. For
/// example, one can make matching case insensitive.
///
/// The lifetime `'a` refers to the lifetime of the pattern string.
#[derive(Clone, Debug)]
pub struct GlobBuilder<'a> {
    /// The glob pattern to compile.
    glob: &'a str,
    /// Options for the pattern.
    opts: GlobOptions,
}

#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
struct GlobOptions {
    /// Whether to match case insensitively.
    case_insensitive: bool,
    /// Whether to require a literal separator to match a separator in a file
    /// path. e.g., when enabled, `*` won't match `/`.
    literal_separator: bool,
    /// Whether or not to use `\` to escape special characters.
    /// e.g., when enabled, `\*` will match a literal `*`.
    backslash_escape: bool,
}

impl GlobOptions {
    /// Returns the default options: case sensitive, wildcards may match
    /// separators, and backslash escaping enabled only where `\` is not a
    /// path separator.
    fn default() -> GlobOptions {
        GlobOptions {
            case_insensitive: false,
            literal_separator: false,
            backslash_escape: !is_separator('\\'),
        }
    }
}

/// The token sequence of a parsed glob (or of one branch of an alternation).
#[derive(Clone, Debug, Default, Eq, PartialEq)]
struct Tokens(Vec<Token>);

impl Deref for Tokens {
    type Target = Vec<Token>;
    fn deref(&self) -> &Vec<Token> {
        &self.0
    }
}

impl DerefMut for Tokens {
    fn deref_mut(&mut self) -> &mut Vec<Token> {
        &mut self.0
    }
}

#[derive(Clone, Debug, Eq, PartialEq)]
enum Token {
    Literal(char),
    Any,
    ZeroOrMore,
    RecursivePrefix,
    RecursiveSuffix,
    RecursiveZeroOrMore,
    Class { negated: bool, ranges: Vec<(char, char)> },
    Alternates(Vec<Tokens>),
}

impl Glob {
    /// Builds a new pattern with default options.
    pub fn new(glob: &str) -> Result<Glob, Error> {
        GlobBuilder::new(glob).build()
    }

    /// Returns a matcher for this pattern.
    pub fn compile_matcher(&self) -> GlobMatcher {
        let re =
            new_regex(&self.re).expect("regex compilation shouldn't fail");
        GlobMatcher { pat: self.clone(), re }
    }

    /// Returns a strategic matcher.
    ///
    /// This isn't exposed because it's not clear whether it's actually
    /// faster than just running a regex for a *single* pattern. If it
    /// is faster, then GlobMatcher should do it automatically.
    #[cfg(test)]
    fn compile_strategic_matcher(&self) -> GlobStrategic {
        let strategy = MatchStrategy::new(self);
        let re =
            new_regex(&self.re).expect("regex compilation shouldn't fail");
        GlobStrategic { strategy, pat: self.clone(), re }
    }

    /// Returns the original glob pattern used to build this pattern.
    pub fn glob(&self) -> &str {
        &self.glob
    }

    /// Returns the regular expression string for this glob.
    ///
    /// Note that regular expressions for globs are intended to be matched on
    /// arbitrary bytes (`&[u8]`) instead of Unicode strings (`&str`). In
    /// particular, globs are frequently used on file paths, where there is no
    /// general guarantee that file paths are themselves valid UTF-8.
As a /// result, callers will need to ensure that they are using a regex API /// that can match on arbitrary bytes. For example, the /// [`regex`](https://crates.io/regex) /// crate's /// [`Regex`](https://docs.rs/regex/*/regex/struct.Regex.html) /// API is not suitable for this since it matches on `&str`, but its /// [`bytes::Regex`](https://docs.rs/regex/*/regex/bytes/struct.Regex.html) /// API is suitable for this. pub fn regex(&self) -> &str { &self.re } /// Returns the pattern as a literal if and only if the pattern must match /// an entire path exactly. /// /// The basic format of these patterns is `{literal}`. fn literal(&self) -> Option { if self.opts.case_insensitive { return None; } let mut lit = String::new(); for t in &*self.tokens { match *t { Token::Literal(c) => lit.push(c), _ => return None, } } if lit.is_empty() { None } else { Some(lit) } } /// Returns an extension if this pattern matches a file path if and only /// if the file path has the extension returned. /// /// Note that this extension returned differs from the extension that /// std::path::Path::extension returns. Namely, this extension includes /// the '.'. Also, paths like `.rs` are considered to have an extension /// of `.rs`. fn ext(&self) -> Option { if self.opts.case_insensitive { return None; } let start = match self.tokens.get(0) { Some(&Token::RecursivePrefix) => 1, Some(_) => 0, _ => return None, }; match self.tokens.get(start) { Some(&Token::ZeroOrMore) => { // If there was no recursive prefix, then we only permit // `*` if `*` can match a `/`. For example, if `*` can't // match `/`, then `*.c` doesn't match `foo/bar.c`. 
if start == 0 && self.opts.literal_separator { return None; } } _ => return None, } match self.tokens.get(start + 1) { Some(&Token::Literal('.')) => {} _ => return None, } let mut lit = ".".to_string(); for t in self.tokens[start + 2..].iter() { match *t { Token::Literal('.') | Token::Literal('/') => return None, Token::Literal(c) => lit.push(c), _ => return None, } } if lit.is_empty() { None } else { Some(lit) } } /// This is like `ext`, but returns an extension even if it isn't sufficient /// to imply a match. Namely, if an extension is returned, then it is /// necessary but not sufficient for a match. fn required_ext(&self) -> Option { if self.opts.case_insensitive { return None; } // We don't care at all about the beginning of this pattern. All we // need to check for is if it ends with a literal of the form `.ext`. let mut ext: Vec = vec![]; // built in reverse for t in self.tokens.iter().rev() { match *t { Token::Literal('/') => return None, Token::Literal(c) => { ext.push(c); if c == '.' { break; } } _ => return None, } } if ext.last() != Some(&'.') { None } else { ext.reverse(); Some(ext.into_iter().collect()) } } /// Returns a literal prefix of this pattern if the entire pattern matches /// if the literal prefix matches. fn prefix(&self) -> Option { if self.opts.case_insensitive { return None; } let (end, need_sep) = match self.tokens.last() { Some(&Token::ZeroOrMore) => { if self.opts.literal_separator { // If a trailing `*` can't match a `/`, then we can't // assume a match of the prefix corresponds to a match // of the overall pattern. e.g., `foo/*` with // `literal_separator` enabled matches `foo/bar` but not // `foo/bar/baz`, even though `foo/bar/baz` has a `foo/` // literal prefix. 
return None; } (self.tokens.len() - 1, false) } Some(&Token::RecursiveSuffix) => (self.tokens.len() - 1, true), _ => (self.tokens.len(), false), }; let mut lit = String::new(); for t in &self.tokens[0..end] { match *t { Token::Literal(c) => lit.push(c), _ => return None, } } if need_sep { lit.push('/'); } if lit.is_empty() { None } else { Some(lit) } } /// Returns a literal suffix of this pattern if the entire pattern matches /// if the literal suffix matches. /// /// If a literal suffix is returned and it must match either the entire /// file path or be preceded by a `/`, then also return true. This happens /// with a pattern like `**/foo/bar`. Namely, this pattern matches /// `foo/bar` and `baz/foo/bar`, but not `foofoo/bar`. In this case, the /// suffix returned is `/foo/bar` (but should match the entire path /// `foo/bar`). /// /// When this returns true, the suffix literal is guaranteed to start with /// a `/`. fn suffix(&self) -> Option<(String, bool)> { if self.opts.case_insensitive { return None; } let mut lit = String::new(); let (start, entire) = match self.tokens.get(0) { Some(&Token::RecursivePrefix) => { // We only care if this follows a path component if the next // token is a literal. if let Some(&Token::Literal(_)) = self.tokens.get(1) { lit.push('/'); (1, true) } else { (1, false) } } _ => (0, false), }; let start = match self.tokens.get(start) { Some(&Token::ZeroOrMore) => { // If literal_separator is enabled, then a `*` can't // necessarily match everything, so reporting a suffix match // as a match of the pattern would be a false positive. if self.opts.literal_separator { return None; } start + 1 } _ => start, }; for t in &self.tokens[start..] { match *t { Token::Literal(c) => lit.push(c), _ => return None, } } if lit.is_empty() || lit == "/" { None } else { Some((lit, entire)) } } /// If this pattern only needs to inspect the basename of a file path, /// then the tokens corresponding to only the basename match are returned. 
///
    /// For example, given a pattern of `**/*.foo`, only the tokens
    /// corresponding to `*.foo` are returned.
    ///
    /// Note that this will return None if any match of the basename tokens
    /// doesn't correspond to a match of the entire pattern. For example, the
    /// glob `foo` only matches when a file path has a basename of `foo`, but
    /// doesn't *always* match when a file path has a basename of `foo`. e.g.,
    /// `foo` doesn't match `abc/foo`.
    fn basename_tokens(&self) -> Option<&[Token]> {
        if self.opts.case_insensitive {
            return None;
        }
        // Unless a leading `**/` swallows the parent portion of the path,
        // matching on the basename alone would not be correct.
        if self.tokens.get(0) != Some(&Token::RecursivePrefix) {
            return None;
        }
        let rest = &self.tokens[1..];
        if rest.is_empty() {
            return None;
        }
        let confined_to_basename = rest.iter().all(|tok| match *tok {
            // A separator literal reaches outside the basename.
            Token::Literal(c) => c != '/',
            // `*` and `?` stay inside the basename only when they are not
            // allowed to match a path separator.
            Token::Any | Token::ZeroOrMore => self.opts.literal_separator,
            Token::RecursivePrefix
            | Token::RecursiveSuffix
            | Token::RecursiveZeroOrMore => false,
            // Classes and alternates defeat the literal optimizations
            // anyway, so no attempt is made to analyze them.
            Token::Class { .. } | Token::Alternates(..) => false,
        });
        if confined_to_basename {
            Some(rest)
        } else {
            None
        }
    }

    /// Returns the pattern as a literal if and only if the pattern exclusively
    /// matches the basename of a file path *and* is a literal.
    ///
    /// The basic format of these patterns is `**/{literal}`, where `{literal}`
    /// does not contain a path separator.
fn basename_literal(&self) -> Option<String> {
        // No basename tokens means the pattern is not of the `**/...` form.
        let tokens = self.basename_tokens()?;
        let mut lit = String::new();
        for t in tokens {
            match *t {
                Token::Literal(c) => lit.push(c),
                // Any wildcard means the basename is not a plain literal.
                _ => return None,
            }
        }
        Some(lit)
    }
}

impl<'a> GlobBuilder<'a> {
    /// Create a new builder for the pattern given.
    ///
    /// The pattern is not compiled until `build` is called.
    pub fn new(glob: &'a str) -> GlobBuilder<'a> {
        GlobBuilder { glob, opts: GlobOptions::default() }
    }

    /// Parses and builds the pattern.
    ///
    /// # Errors
    ///
    /// Returns any error produced while parsing, or
    /// `ErrorKind::UnclosedAlternates` if a `{` group is still open once the
    /// whole pattern has been consumed.
    pub fn build(&self) -> Result<Glob, Error> {
        let mut p = Parser {
            glob: &self.glob,
            // The bottom of the stack collects the top-level tokens; every
            // open alternation pushes further entries on top of it.
            stack: vec![Tokens::default()],
            chars: self.glob.chars().peekable(),
            prev: None,
            cur: None,
            opts: &self.opts,
        };
        p.parse()?;
        if p.stack.is_empty() {
            Err(Error {
                glob: Some(self.glob.to_string()),
                kind: ErrorKind::UnopenedAlternates,
            })
        } else if p.stack.len() > 1 {
            // More than one entry left means a `{` was never closed.
            Err(Error {
                glob: Some(self.glob.to_string()),
                kind: ErrorKind::UnclosedAlternates,
            })
        } else {
            let tokens = p.stack.pop().unwrap();
            Ok(Glob {
                glob: self.glob.to_string(),
                re: tokens.to_regex_with(&self.opts),
                opts: self.opts,
                tokens,
            })
        }
    }

    /// Toggle whether the pattern matches case insensitively or not.
    ///
    /// This is disabled by default.
    pub fn case_insensitive(&mut self, yes: bool) -> &mut GlobBuilder<'a> {
        self.opts.case_insensitive = yes;
        self
    }

    /// Toggle whether a literal `/` is required to match a path separator.
    ///
    /// By default this is false: `*` and `?` will match `/`.
    pub fn literal_separator(&mut self, yes: bool) -> &mut GlobBuilder<'a> {
        self.opts.literal_separator = yes;
        self
    }

    /// When enabled, a back slash (`\`) may be used to escape
    /// special characters in a glob pattern. Additionally, this will
    /// prevent `\` from being interpreted as a path separator on all
    /// platforms.
    ///
    /// This is enabled by default on platforms where `\` is not a
    /// path separator and disabled by default on platforms where `\`
    /// is a path separator.
pub fn backslash_escape(&mut self, yes: bool) -> &mut GlobBuilder<'a> {
        self.opts.backslash_escape = yes;
        self
    }
}

impl Tokens {
    /// Convert this pattern to a string that is guaranteed to be a valid
    /// regular expression and will represent the matching semantics of this
    /// glob pattern and the options given.
    ///
    /// The result always starts with `(?-u)` (followed by `(?i)` when case
    /// insensitive matching is requested) and is anchored with `^` and `$`.
    fn to_regex_with(&self, options: &GlobOptions) -> String {
        let mut re = String::new();
        re.push_str("(?-u)");
        if options.case_insensitive {
            re.push_str("(?i)");
        }
        re.push('^');
        // Special case. If the entire glob is just `**`, then it should match
        // everything.
        if self.len() == 1 && self[0] == Token::RecursivePrefix {
            re.push_str(".*");
            re.push('$');
            return re;
        }
        self.tokens_to_regex(options, &self, &mut re);
        re.push('$');
        re
    }

    /// Appends the regex translation of `tokens` to `re`.
    ///
    /// Alternates are translated by recursing into each branch and joining
    /// the non-empty results with `|` inside a group.
    fn tokens_to_regex(
        &self,
        options: &GlobOptions,
        tokens: &[Token],
        re: &mut String,
    ) {
        for tok in tokens {
            match *tok {
                Token::Literal(c) => {
                    re.push_str(&char_to_escaped_literal(c));
                }
                Token::Any => {
                    if options.literal_separator {
                        re.push_str("[^/]");
                    } else {
                        re.push_str(".");
                    }
                }
                Token::ZeroOrMore => {
                    if options.literal_separator {
                        re.push_str("[^/]*");
                    } else {
                        re.push_str(".*");
                    }
                }
                Token::RecursivePrefix => {
                    re.push_str("(?:/?|.*/)");
                }
                Token::RecursiveSuffix => {
                    re.push_str("/.*");
                }
                Token::RecursiveZeroOrMore => {
                    re.push_str("(?:/|/.*/)");
                }
                Token::Class { negated, ref ranges } => {
                    // NOTE(review): a non-ASCII char is escaped as several
                    // `\xNN` bytes, which inside `[...]` under `(?-u)` become
                    // separate class members rather than one character.
                    // Confirm whether that is intended.
                    re.push('[');
                    if negated {
                        re.push('^');
                    }
                    for r in ranges {
                        if r.0 == r.1 {
                            // Not strictly necessary, but nicer to look at.
                            re.push_str(&char_to_escaped_literal(r.0));
                        } else {
                            re.push_str(&char_to_escaped_literal(r.0));
                            re.push('-');
                            re.push_str(&char_to_escaped_literal(r.1));
                        }
                    }
                    re.push(']');
                }
                Token::Alternates(ref patterns) => {
                    let mut parts = vec![];
                    for pat in patterns {
                        let mut altre = String::new();
                        self.tokens_to_regex(options, &pat, &mut altre);
                        if !altre.is_empty() {
                            parts.push(altre);
                        }
                    }

                    // It is possible to have an empty set in which case the
                    // resulting alternation '()' would be an error.
                    if !parts.is_empty() {
                        re.push('(');
                        re.push_str(&parts.join("|"));
                        re.push(')');
                    }
                }
            }
        }
    }
}

/// Convert a Unicode scalar value to an escaped string suitable for use as
/// a literal in a non-Unicode regex.
fn char_to_escaped_literal(c: char) -> String {
    bytes_to_escaped_literal(&c.to_string().into_bytes())
}

/// Converts an arbitrary sequence of bytes to a UTF-8 string. All non-ASCII
/// code units are converted to their escaped form.
///
/// ASCII bytes go through `regex::escape`, so regex metacharacters are
/// backslash-escaped; every other byte is written as `\xNN`.
fn bytes_to_escaped_literal(bs: &[u8]) -> String {
    let mut s = String::with_capacity(bs.len());
    for &b in bs {
        if b <= 0x7F {
            s.push_str(&regex::escape(&(b as char).to_string()));
        } else {
            s.push_str(&format!("\\x{:02x}", b));
        }
    }
    s
}

/// Parser state for turning a glob string into `Tokens`.
struct Parser<'a> {
    /// The full glob text, kept for error reporting.
    glob: &'a str,
    /// Token groups under construction; the last entry is the group that
    /// newly parsed tokens are pushed onto.
    stack: Vec<Tokens>,
    /// The remaining characters of the glob.
    chars: iter::Peekable<std::str::Chars<'a>>,
    /// The character returned by the call to `bump` before the latest one.
    prev: Option<char>,
    /// The character returned by the latest call to `bump`.
    cur: Option<char>,
    /// The options the glob is parsed with.
    opts: &'a GlobOptions,
}

impl<'a> Parser<'a> {
    /// Builds an error of the given kind that carries the glob being parsed.
    fn error(&self, kind: ErrorKind) -> Error {
        Error { glob: Some(self.glob.to_string()), kind }
    }

    /// Consumes the whole glob, dispatching on each character.
    fn parse(&mut self) -> Result<(), Error> {
        while let Some(c) = self.bump() {
            match c {
                '?'
=> self.push_token(Token::Any)?,
                '*' => self.parse_star()?,
                '[' => self.parse_class()?,
                '{' => self.push_alternate()?,
                '}' => self.pop_alternate()?,
                ',' => self.parse_comma()?,
                '\\' => self.parse_backslash()?,
                c => self.push_token(Token::Literal(c))?,
            }
        }
        Ok(())
    }

    /// Opens an alternation group (`{`) by pushing a fresh token group.
    ///
    /// Fails with `NestedAlternates` if a group is already open.
    fn push_alternate(&mut self) -> Result<(), Error> {
        if self.stack.len() > 1 {
            return Err(self.error(ErrorKind::NestedAlternates));
        }
        self.stack.push(Tokens::default());
        Ok(())
    }

    /// Closes an alternation group (`}`): every group above the bottom of
    /// the stack becomes one branch of a single `Alternates` token.
    ///
    /// Note that when no group is open the loop does not run, so a stray `}`
    /// pushes an `Alternates` token with no branches rather than failing.
    fn pop_alternate(&mut self) -> Result<(), Error> {
        let mut alts = vec![];
        while self.stack.len() >= 2 {
            alts.push(self.stack.pop().unwrap());
        }
        self.push_token(Token::Alternates(alts))
    }

    /// Appends `tok` to the token group on top of the stack.
    fn push_token(&mut self, tok: Token) -> Result<(), Error> {
        if let Some(pat) = self.stack.last_mut() {
            pat.push(tok);
            return Ok(());
        }
        Err(self.error(ErrorKind::UnopenedAlternates))
    }

    /// Removes and returns the last token of the group on top of the stack.
    ///
    /// Panics if that group is empty; callers check `have_tokens` first.
    fn pop_token(&mut self) -> Result<Token, Error> {
        if let Some(pat) = self.stack.last_mut() {
            return Ok(pat.pop().unwrap());
        }
        Err(self.error(ErrorKind::UnopenedAlternates))
    }

    /// Reports whether the group on top of the stack holds any tokens.
    fn have_tokens(&self) -> Result<bool, Error> {
        match self.stack.last() {
            None => Err(self.error(ErrorKind::UnopenedAlternates)),
            Some(pat) => Ok(!pat.is_empty()),
        }
    }

    /// Handles a `,`, which separates branches only inside `{...}`.
    fn parse_comma(&mut self) -> Result<(), Error> {
        // If we aren't inside a group alternation, then don't
        // treat commas specially. Otherwise, we need to start
        // a new alternate.
        if self.stack.len() <= 1 {
            self.push_token(Token::Literal(','))
        } else {
            self.stack.push(Tokens::default());
            Ok(())
        }
    }

    /// Handles a `\`: an escape when `backslash_escape` is on, otherwise a
    /// path separator or a literal backslash depending on the platform.
    fn parse_backslash(&mut self) -> Result<(), Error> {
        if self.opts.backslash_escape {
            match self.bump() {
                None => Err(self.error(ErrorKind::DanglingEscape)),
                Some(c) => self.push_token(Token::Literal(c)),
            }
        } else if is_separator('\\') {
            // Normalize all patterns to use / as a separator.
self.push_token(Token::Literal('/'))
        } else {
            self.push_token(Token::Literal('\\'))
        }
    }

    /// Parses a `*` that has just been consumed, deciding between a plain
    /// `*`, two plain `*`s, and the recursive `**` tokens.
    ///
    /// A `**` becomes a recursive token only when it fills a whole path
    /// component:
    ///
    /// * first in the current token group and followed by a separator or the
    ///   end of the glob: `RecursivePrefix`;
    /// * preceded by a separator and followed by the end of the glob (or by
    ///   `,`/`}` inside an alternation): `RecursiveSuffix`;
    /// * preceded and followed by a separator: `RecursiveZeroOrMore`.
    ///
    /// Anywhere else `**` is pushed as two `ZeroOrMore` tokens.
    fn parse_star(&mut self) -> Result<(), Error> {
        // Capture the character before this `*` before any further bumps.
        let prev = self.prev;
        if self.peek() != Some('*') {
            self.push_token(Token::ZeroOrMore)?;
            return Ok(());
        }
        assert!(self.bump() == Some('*'));
        if !self.have_tokens()? {
            if !self.peek().map_or(true, is_separator) {
                self.push_token(Token::ZeroOrMore)?;
                self.push_token(Token::ZeroOrMore)?;
            } else {
                self.push_token(Token::RecursivePrefix)?;
                // Consume the separator that follows; the token covers it.
                assert!(self.bump().map_or(true, is_separator));
            }
            return Ok(());
        }

        if !prev.map(is_separator).unwrap_or(false) {
            if self.stack.len() <= 1
                || (prev != Some(',') && prev != Some('{'))
            {
                self.push_token(Token::ZeroOrMore)?;
                self.push_token(Token::ZeroOrMore)?;
                return Ok(());
            }
        }
        let is_suffix = match self.peek() {
            None => {
                assert!(self.bump().is_none());
                true
            }
            Some(',') | Some('}') if self.stack.len() >= 2 => true,
            Some(c) if is_separator(c) => {
                assert!(self.bump().map(is_separator).unwrap_or(false));
                false
            }
            _ => {
                self.push_token(Token::ZeroOrMore)?;
                self.push_token(Token::ZeroOrMore)?;
                return Ok(());
            }
        };
        // Replace the token before the `**` (normally the separator literal).
        // An existing recursive prefix/suffix is pushed back unchanged, which
        // collapses repeated `**/` components into one token.
        match self.pop_token()? {
            Token::RecursivePrefix => {
                self.push_token(Token::RecursivePrefix)?;
            }
            Token::RecursiveSuffix => {
                self.push_token(Token::RecursiveSuffix)?;
            }
            _ => {
                if is_suffix {
                    self.push_token(Token::RecursiveSuffix)?;
                } else {
                    self.push_token(Token::RecursiveZeroOrMore)?;
                }
            }
        }
        Ok(())
    }

    /// Parses a character class after its opening `[` has been consumed and
    /// pushes the resulting `Token::Class`.
    ///
    /// A leading `!` or `^` negates the class. A `]` or `-` in first position
    /// is taken literally, as is a trailing `-`.
    ///
    /// # Errors
    ///
    /// `UnclosedClass` if the glob ends before the closing `]`, and
    /// `InvalidRange` if a range ends below its start.
    fn parse_class(&mut self) -> Result<(), Error> {
        // Sets the end of range `r` to `add`, rejecting inverted ranges.
        fn add_to_last_range(
            glob: &str,
            r: &mut (char, char),
            add: char,
        ) -> Result<(), Error> {
            r.1 = add;
            if r.1 < r.0 {
                Err(Error {
                    glob: Some(glob.to_string()),
                    kind: ErrorKind::InvalidRange(r.0, r.1),
                })
            } else {
                Ok(())
            }
        }
        let mut ranges = vec![];
        let negated = match self.chars.peek() {
            Some(&'!') | Some(&'^') => {
                let bump = self.bump();
                assert!(bump == Some('!') || bump == Some('^'));
                true
            }
            _ => false,
        };
        // `first` is true only for the first class member; `in_range` is true
        // right after a `-` that may introduce the end of a range.
        let mut first = true;
        let mut in_range = false;
        loop {
            let c = match self.bump() {
                Some(c) => c,
                // The only way to successfully break this loop is to observe
                // a ']'.
                None => return Err(self.error(ErrorKind::UnclosedClass)),
            };
            match c {
                ']' => {
                    if first {
                        ranges.push((']', ']'));
                    } else {
                        break;
                    }
                }
                '-' => {
                    if first {
                        ranges.push(('-', '-'));
                    } else if in_range {
                        // invariant: in_range is only set when there is
                        // already at least one character seen.
                        let r = ranges.last_mut().unwrap();
                        add_to_last_range(&self.glob, r, '-')?;
                        in_range = false;
                    } else {
                        assert!(!ranges.is_empty());
                        in_range = true;
                    }
                }
                c => {
                    if in_range {
                        // invariant: in_range is only set when there is
                        // already at least one character seen.
                        add_to_last_range(
                            &self.glob,
                            ranges.last_mut().unwrap(),
                            c,
                        )?;
                    } else {
                        ranges.push((c, c));
                    }
                    in_range = false;
                }
            }
            first = false;
        }
        if in_range {
            // Means that the last character in the class was a '-', so add
            // it as a literal.
            ranges.push(('-', '-'));
        }
        self.push_token(Token::Class { negated, ranges })
    }

    /// Advances to the next character, remembering the previous one in
    /// `prev`, and returns the new current character.
    fn bump(&mut self) -> Option<char> {
        self.prev = self.cur;
        self.cur = self.chars.next();
        self.cur
    }

    /// Returns the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.chars.peek().cloned()
    }
}

/// Returns true if `haystack` begins with `needle`.
#[cfg(test)]
fn starts_with(needle: &[u8], haystack: &[u8]) -> bool {
    haystack.starts_with(needle)
}

/// Returns true if `haystack` ends with `needle`.
#[cfg(test)]
fn ends_with(needle: &[u8], haystack: &[u8]) -> bool {
    haystack.ends_with(needle)
}

#[cfg(test)]
mod tests {
    use super::Token::*;
    use super::{Glob, GlobBuilder, Token};
    use crate::{ErrorKind, GlobSetBuilder};

    /// Optional builder settings for a test; `None` leaves the builder's
    /// default untouched.
    #[derive(Clone, Copy, Debug, Default)]
    struct Options {
        casei: Option<bool>,
        litsep: Option<bool>,
        bsesc: Option<bool>,
    }

    // Asserts that a glob parses into exactly the given tokens.
    macro_rules! syntax {
        ($name:ident, $pat:expr, $tokens:expr) => {
            #[test]
            fn $name() {
                let pat = Glob::new($pat).unwrap();
                assert_eq!($tokens, pat.tokens.0);
            }
        };
    }

    // Asserts that parsing a glob fails with the given error kind.
    macro_rules! syntaxerr {
        ($name:ident, $pat:expr, $err:expr) => {
            #[test]
            fn $name() {
                let err = Glob::new($pat).unwrap_err();
                assert_eq!(&$err, err.kind());
            }
        };
    }

    // Asserts that a glob translates to the given regex (the `(?-u)` prefix
    // is added here).
    macro_rules! toregex {
        ($name:ident, $pat:expr, $re:expr) => {
            toregex!($name, $pat, $re, Options::default());
        };
        ($name:ident, $pat:expr, $re:expr, $options:expr) => {
            #[test]
            fn $name() {
                let mut builder = GlobBuilder::new($pat);
                if let Some(casei) = $options.casei {
                    builder.case_insensitive(casei);
                }
                if let Some(litsep) = $options.litsep {
                    builder.literal_separator(litsep);
                }
                if let Some(bsesc) = $options.bsesc {
                    builder.backslash_escape(bsesc);
                }
                let pat = builder.build().unwrap();
                assert_eq!(format!("(?-u){}", $re), pat.regex());
            }
        };
    }

    macro_rules!
matches { ($name:ident, $pat:expr, $path:expr) => { matches!($name, $pat, $path, Options::default()); }; ($name:ident, $pat:expr, $path:expr, $options:expr) => { #[test] fn $name() { let mut builder = GlobBuilder::new($pat); if let Some(casei) = $options.casei { builder.case_insensitive(casei); } if let Some(litsep) = $options.litsep { builder.literal_separator(litsep); } if let Some(bsesc) = $options.bsesc { builder.backslash_escape(bsesc); } let pat = builder.build().unwrap(); let matcher = pat.compile_matcher(); let strategic = pat.compile_strategic_matcher(); let set = GlobSetBuilder::new().add(pat).build().unwrap(); assert!(matcher.is_match($path)); assert!(strategic.is_match($path)); assert!(set.is_match($path)); } }; } macro_rules! nmatches { ($name:ident, $pat:expr, $path:expr) => { nmatches!($name, $pat, $path, Options::default()); }; ($name:ident, $pat:expr, $path:expr, $options:expr) => { #[test] fn $name() { let mut builder = GlobBuilder::new($pat); if let Some(casei) = $options.casei { builder.case_insensitive(casei); } if let Some(litsep) = $options.litsep { builder.literal_separator(litsep); } if let Some(bsesc) = $options.bsesc { builder.backslash_escape(bsesc); } let pat = builder.build().unwrap(); let matcher = pat.compile_matcher(); let strategic = pat.compile_strategic_matcher(); let set = GlobSetBuilder::new().add(pat).build().unwrap(); assert!(!matcher.is_match($path)); assert!(!strategic.is_match($path)); assert!(!set.is_match($path)); } }; } fn s(string: &str) -> String { string.to_string() } fn class(s: char, e: char) -> Token { Class { negated: false, ranges: vec![(s, e)] } } fn classn(s: char, e: char) -> Token { Class { negated: true, ranges: vec![(s, e)] } } fn rclass(ranges: &[(char, char)]) -> Token { Class { negated: false, ranges: ranges.to_vec() } } fn rclassn(ranges: &[(char, char)]) -> Token { Class { negated: true, ranges: ranges.to_vec() } } syntax!(literal1, "a", vec![Literal('a')]); syntax!(literal2, "ab", vec![Literal('a'), 
Literal('b')]); syntax!(any1, "?", vec![Any]); syntax!(any2, "a?b", vec![Literal('a'), Any, Literal('b')]); syntax!(seq1, "*", vec![ZeroOrMore]); syntax!(seq2, "a*b", vec![Literal('a'), ZeroOrMore, Literal('b')]); syntax!( seq3, "*a*b*", vec![ZeroOrMore, Literal('a'), ZeroOrMore, Literal('b'), ZeroOrMore,] ); syntax!(rseq1, "**", vec![RecursivePrefix]); syntax!(rseq2, "**/", vec![RecursivePrefix]); syntax!(rseq3, "/**", vec![RecursiveSuffix]); syntax!(rseq4, "/**/", vec![RecursiveZeroOrMore]); syntax!( rseq5, "a/**/b", vec![Literal('a'), RecursiveZeroOrMore, Literal('b'),] ); syntax!(cls1, "[a]", vec![class('a', 'a')]); syntax!(cls2, "[!a]", vec![classn('a', 'a')]); syntax!(cls3, "[a-z]", vec![class('a', 'z')]); syntax!(cls4, "[!a-z]", vec![classn('a', 'z')]); syntax!(cls5, "[-]", vec![class('-', '-')]); syntax!(cls6, "[]]", vec![class(']', ']')]); syntax!(cls7, "[*]", vec![class('*', '*')]); syntax!(cls8, "[!!]", vec![classn('!', '!')]); syntax!(cls9, "[a-]", vec![rclass(&[('a', 'a'), ('-', '-')])]); syntax!(cls10, "[-a-z]", vec![rclass(&[('-', '-'), ('a', 'z')])]); syntax!(cls11, "[a-z-]", vec![rclass(&[('a', 'z'), ('-', '-')])]); syntax!( cls12, "[-a-z-]", vec![rclass(&[('-', '-'), ('a', 'z'), ('-', '-')]),] ); syntax!(cls13, "[]-z]", vec![class(']', 'z')]); syntax!(cls14, "[--z]", vec![class('-', 'z')]); syntax!(cls15, "[ --]", vec![class(' ', '-')]); syntax!(cls16, "[0-9a-z]", vec![rclass(&[('0', '9'), ('a', 'z')])]); syntax!(cls17, "[a-z0-9]", vec![rclass(&[('a', 'z'), ('0', '9')])]); syntax!(cls18, "[!0-9a-z]", vec![rclassn(&[('0', '9'), ('a', 'z')])]); syntax!(cls19, "[!a-z0-9]", vec![rclassn(&[('a', 'z'), ('0', '9')])]); syntax!(cls20, "[^a]", vec![classn('a', 'a')]); syntax!(cls21, "[^a-z]", vec![classn('a', 'z')]); syntaxerr!(err_unclosed1, "[", ErrorKind::UnclosedClass); syntaxerr!(err_unclosed2, "[]", ErrorKind::UnclosedClass); syntaxerr!(err_unclosed3, "[!", ErrorKind::UnclosedClass); syntaxerr!(err_unclosed4, "[!]", ErrorKind::UnclosedClass); 
syntaxerr!(err_range1, "[z-a]", ErrorKind::InvalidRange('z', 'a')); syntaxerr!(err_range2, "[z--]", ErrorKind::InvalidRange('z', '-')); const CASEI: Options = Options { casei: Some(true), litsep: None, bsesc: None }; const SLASHLIT: Options = Options { casei: None, litsep: Some(true), bsesc: None }; const NOBSESC: Options = Options { casei: None, litsep: None, bsesc: Some(false) }; const BSESC: Options = Options { casei: None, litsep: None, bsesc: Some(true) }; toregex!(re_casei, "a", "(?i)^a$", &CASEI); toregex!(re_slash1, "?", r"^[^/]$", SLASHLIT); toregex!(re_slash2, "*", r"^[^/]*$", SLASHLIT); toregex!(re1, "a", "^a$"); toregex!(re2, "?", "^.$"); toregex!(re3, "*", "^.*$"); toregex!(re4, "a?", "^a.$"); toregex!(re5, "?a", "^.a$"); toregex!(re6, "a*", "^a.*$"); toregex!(re7, "*a", "^.*a$"); toregex!(re8, "[*]", r"^[\*]$"); toregex!(re9, "[+]", r"^[\+]$"); toregex!(re10, "+", r"^\+$"); toregex!(re11, "☃", r"^\xe2\x98\x83$"); toregex!(re12, "**", r"^.*$"); toregex!(re13, "**/", r"^.*$"); toregex!(re14, "**/*", r"^(?:/?|.*/).*$"); toregex!(re15, "**/**", r"^.*$"); toregex!(re16, "**/**/*", r"^(?:/?|.*/).*$"); toregex!(re17, "**/**/**", r"^.*$"); toregex!(re18, "**/**/**/*", r"^(?:/?|.*/).*$"); toregex!(re19, "a/**", r"^a/.*$"); toregex!(re20, "a/**/**", r"^a/.*$"); toregex!(re21, "a/**/**/**", r"^a/.*$"); toregex!(re22, "a/**/b", r"^a(?:/|/.*/)b$"); toregex!(re23, "a/**/**/b", r"^a(?:/|/.*/)b$"); toregex!(re24, "a/**/**/**/b", r"^a(?:/|/.*/)b$"); toregex!(re25, "**/b", r"^(?:/?|.*/)b$"); toregex!(re26, "**/**/b", r"^(?:/?|.*/)b$"); toregex!(re27, "**/**/**/b", r"^(?:/?|.*/)b$"); toregex!(re28, "a**", r"^a.*.*$"); toregex!(re29, "**a", r"^.*.*a$"); toregex!(re30, "a**b", r"^a.*.*b$"); toregex!(re31, "***", r"^.*.*.*$"); toregex!(re32, "/a**", r"^/a.*.*$"); toregex!(re33, "/**a", r"^/.*.*a$"); toregex!(re34, "/a**b", r"^/a.*.*b$"); matches!(match1, "a", "a"); matches!(match2, "a*b", "a_b"); matches!(match3, "a*b*c", "abc"); matches!(match4, "a*b*c", "a_b_c"); 
matches!(match5, "a*b*c", "a___b___c"); matches!(match6, "abc*abc*abc", "abcabcabcabcabcabcabc"); matches!(match7, "a*a*a*a*a*a*a*a*a", "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); matches!(match8, "a*b[xyz]c*d", "abxcdbxcddd"); matches!(match9, "*.rs", ".rs"); matches!(match10, "☃", "☃"); matches!(matchrec1, "some/**/needle.txt", "some/needle.txt"); matches!(matchrec2, "some/**/needle.txt", "some/one/needle.txt"); matches!(matchrec3, "some/**/needle.txt", "some/one/two/needle.txt"); matches!(matchrec4, "some/**/needle.txt", "some/other/needle.txt"); matches!(matchrec5, "**", "abcde"); matches!(matchrec6, "**", ""); matches!(matchrec7, "**", ".asdf"); matches!(matchrec8, "**", "/x/.asdf"); matches!(matchrec9, "some/**/**/needle.txt", "some/needle.txt"); matches!(matchrec10, "some/**/**/needle.txt", "some/one/needle.txt"); matches!(matchrec11, "some/**/**/needle.txt", "some/one/two/needle.txt"); matches!(matchrec12, "some/**/**/needle.txt", "some/other/needle.txt"); matches!(matchrec13, "**/test", "one/two/test"); matches!(matchrec14, "**/test", "one/test"); matches!(matchrec15, "**/test", "test"); matches!(matchrec16, "/**/test", "/one/two/test"); matches!(matchrec17, "/**/test", "/one/test"); matches!(matchrec18, "/**/test", "/test"); matches!(matchrec19, "**/.*", ".abc"); matches!(matchrec20, "**/.*", "abc/.abc"); matches!(matchrec21, "**/foo/bar", "foo/bar"); matches!(matchrec22, ".*/**", ".abc/abc"); matches!(matchrec23, "test/**", "test/"); matches!(matchrec24, "test/**", "test/one"); matches!(matchrec25, "test/**", "test/one/two"); matches!(matchrec26, "some/*/needle.txt", "some/one/needle.txt"); matches!(matchrange1, "a[0-9]b", "a0b"); matches!(matchrange2, "a[0-9]b", "a9b"); matches!(matchrange3, "a[!0-9]b", "a_b"); matches!(matchrange4, "[a-z123]", "1"); matches!(matchrange5, "[1a-z23]", "1"); matches!(matchrange6, "[123a-z]", "1"); matches!(matchrange7, "[abc-]", "-"); matches!(matchrange8, "[-abc]", "-"); matches!(matchrange9, "[-a-c]", "b"); 
matches!(matchrange10, "[a-c-]", "b"); matches!(matchrange11, "[-]", "-"); matches!(matchrange12, "a[^0-9]b", "a_b"); matches!(matchpat1, "*hello.txt", "hello.txt"); matches!(matchpat2, "*hello.txt", "gareth_says_hello.txt"); matches!(matchpat3, "*hello.txt", "some/path/to/hello.txt"); matches!(matchpat4, "*hello.txt", "some\\path\\to\\hello.txt"); matches!(matchpat5, "*hello.txt", "/an/absolute/path/to/hello.txt"); matches!(matchpat6, "*some/path/to/hello.txt", "some/path/to/hello.txt"); matches!( matchpat7, "*some/path/to/hello.txt", "a/bigger/some/path/to/hello.txt" ); matches!(matchescape, "_[[]_[]]_[?]_[*]_!_", "_[_]_?_*_!_"); matches!(matchcasei1, "aBcDeFg", "aBcDeFg", CASEI); matches!(matchcasei2, "aBcDeFg", "abcdefg", CASEI); matches!(matchcasei3, "aBcDeFg", "ABCDEFG", CASEI); matches!(matchcasei4, "aBcDeFg", "AbCdEfG", CASEI); matches!(matchalt1, "a,b", "a,b"); matches!(matchalt2, ",", ","); matches!(matchalt3, "{a,b}", "a"); matches!(matchalt4, "{a,b}", "b"); matches!(matchalt5, "{**/src/**,foo}", "abc/src/bar"); matches!(matchalt6, "{**/src/**,foo}", "foo"); matches!(matchalt7, "{[}],foo}", "}"); matches!(matchalt8, "{foo}", "foo"); matches!(matchalt9, "{}", ""); matches!(matchalt10, "{,}", ""); matches!(matchalt11, "{*.foo,*.bar,*.wat}", "test.foo"); matches!(matchalt12, "{*.foo,*.bar,*.wat}", "test.bar"); matches!(matchalt13, "{*.foo,*.bar,*.wat}", "test.wat"); matches!(matchslash1, "abc/def", "abc/def", SLASHLIT); #[cfg(unix)] nmatches!(matchslash2, "abc?def", "abc/def", SLASHLIT); #[cfg(not(unix))] nmatches!(matchslash2, "abc?def", "abc\\def", SLASHLIT); nmatches!(matchslash3, "abc*def", "abc/def", SLASHLIT); matches!(matchslash4, "abc[/]def", "abc/def", SLASHLIT); // differs #[cfg(unix)] nmatches!(matchslash5, "abc\\def", "abc/def", SLASHLIT); #[cfg(not(unix))] matches!(matchslash5, "abc\\def", "abc/def", SLASHLIT); matches!(matchbackslash1, "\\[", "[", BSESC); matches!(matchbackslash2, "\\?", "?", BSESC); matches!(matchbackslash3, "\\*", "*", 
BSESC); matches!(matchbackslash4, "\\[a-z]", "\\a", NOBSESC); matches!(matchbackslash5, "\\?", "\\a", NOBSESC); matches!(matchbackslash6, "\\*", "\\\\", NOBSESC); #[cfg(unix)] matches!(matchbackslash7, "\\a", "a"); #[cfg(not(unix))] matches!(matchbackslash8, "\\a", "/a"); nmatches!(matchnot1, "a*b*c", "abcd"); nmatches!(matchnot2, "abc*abc*abc", "abcabcabcabcabcabcabca"); nmatches!(matchnot3, "some/**/needle.txt", "some/other/notthis.txt"); nmatches!(matchnot4, "some/**/**/needle.txt", "some/other/notthis.txt"); nmatches!(matchnot5, "/**/test", "test"); nmatches!(matchnot6, "/**/test", "/one/notthis"); nmatches!(matchnot7, "/**/test", "/notthis"); nmatches!(matchnot8, "**/.*", "ab.c"); nmatches!(matchnot9, "**/.*", "abc/ab.c"); nmatches!(matchnot10, ".*/**", "a.bc"); nmatches!(matchnot11, ".*/**", "abc/a.bc"); nmatches!(matchnot12, "a[0-9]b", "a_b"); nmatches!(matchnot13, "a[!0-9]b", "a0b"); nmatches!(matchnot14, "a[!0-9]b", "a9b"); nmatches!(matchnot15, "[!-]", "-"); nmatches!(matchnot16, "*hello.txt", "hello.txt-and-then-some"); nmatches!(matchnot17, "*hello.txt", "goodbye.txt"); nmatches!( matchnot18, "*some/path/to/hello.txt", "some/path/to/hello.txt-and-then-some" ); nmatches!( matchnot19, "*some/path/to/hello.txt", "some/other/path/to/hello.txt" ); nmatches!(matchnot20, "a", "foo/a"); nmatches!(matchnot21, "./foo", "foo"); nmatches!(matchnot22, "**/foo", "foofoo"); nmatches!(matchnot23, "**/foo/bar", "foofoo/bar"); nmatches!(matchnot24, "/*.c", "mozilla-sha1/sha1.c"); nmatches!(matchnot25, "*.c", "mozilla-sha1/sha1.c", SLASHLIT); nmatches!( matchnot26, "**/m4/ltoptions.m4", "csharp/src/packages/repositories.config", SLASHLIT ); nmatches!(matchnot27, "a[^0-9]b", "a0b"); nmatches!(matchnot28, "a[^0-9]b", "a9b"); nmatches!(matchnot29, "[^-]", "-"); nmatches!(matchnot30, "some/*/needle.txt", "some/needle.txt"); nmatches!( matchrec31, "some/*/needle.txt", "some/one/two/needle.txt", SLASHLIT ); nmatches!( matchrec32, "some/*/needle.txt", 
"some/one/two/three/needle.txt", SLASHLIT ); nmatches!(matchrec33, ".*/**", ".abc"); nmatches!(matchrec34, "foo/**", "foo"); macro_rules! extract { ($which:ident, $name:ident, $pat:expr, $expect:expr) => { extract!($which, $name, $pat, $expect, Options::default()); }; ($which:ident, $name:ident, $pat:expr, $expect:expr, $options:expr) => { #[test] fn $name() { let mut builder = GlobBuilder::new($pat); if let Some(casei) = $options.casei { builder.case_insensitive(casei); } if let Some(litsep) = $options.litsep { builder.literal_separator(litsep); } if let Some(bsesc) = $options.bsesc { builder.backslash_escape(bsesc); } let pat = builder.build().unwrap(); assert_eq!($expect, pat.$which()); } }; } macro_rules! literal { ($($tt:tt)*) => { extract!(literal, $($tt)*); } } macro_rules! basetokens { ($($tt:tt)*) => { extract!(basename_tokens, $($tt)*); } } macro_rules! ext { ($($tt:tt)*) => { extract!(ext, $($tt)*); } } macro_rules! required_ext { ($($tt:tt)*) => { extract!(required_ext, $($tt)*); } } macro_rules! prefix { ($($tt:tt)*) => { extract!(prefix, $($tt)*); } } macro_rules! suffix { ($($tt:tt)*) => { extract!(suffix, $($tt)*); } } macro_rules! 
baseliteral { ($($tt:tt)*) => { extract!(basename_literal, $($tt)*); } } literal!(extract_lit1, "foo", Some(s("foo"))); literal!(extract_lit2, "foo", None, CASEI); literal!(extract_lit3, "/foo", Some(s("/foo"))); literal!(extract_lit4, "/foo/", Some(s("/foo/"))); literal!(extract_lit5, "/foo/bar", Some(s("/foo/bar"))); literal!(extract_lit6, "*.foo", None); literal!(extract_lit7, "foo/bar", Some(s("foo/bar"))); literal!(extract_lit8, "**/foo/bar", None); basetokens!( extract_basetoks1, "**/foo", Some(&*vec![Literal('f'), Literal('o'), Literal('o'),]) ); basetokens!(extract_basetoks2, "**/foo", None, CASEI); basetokens!( extract_basetoks3, "**/foo", Some(&*vec![Literal('f'), Literal('o'), Literal('o'),]), SLASHLIT ); basetokens!(extract_basetoks4, "*foo", None, SLASHLIT); basetokens!(extract_basetoks5, "*foo", None); basetokens!(extract_basetoks6, "**/fo*o", None); basetokens!( extract_basetoks7, "**/fo*o", Some(&*vec![Literal('f'), Literal('o'), ZeroOrMore, Literal('o'),]), SLASHLIT ); ext!(extract_ext1, "**/*.rs", Some(s(".rs"))); ext!(extract_ext2, "**/*.rs.bak", None); ext!(extract_ext3, "*.rs", Some(s(".rs"))); ext!(extract_ext4, "a*.rs", None); ext!(extract_ext5, "/*.c", None); ext!(extract_ext6, "*.c", None, SLASHLIT); ext!(extract_ext7, "*.c", Some(s(".c"))); required_ext!(extract_req_ext1, "*.rs", Some(s(".rs"))); required_ext!(extract_req_ext2, "/foo/bar/*.rs", Some(s(".rs"))); required_ext!(extract_req_ext3, "/foo/bar/*.rs", Some(s(".rs"))); required_ext!(extract_req_ext4, "/foo/bar/.rs", Some(s(".rs"))); required_ext!(extract_req_ext5, ".rs", Some(s(".rs"))); required_ext!(extract_req_ext6, "./rs", None); required_ext!(extract_req_ext7, "foo", None); required_ext!(extract_req_ext8, ".foo/", None); required_ext!(extract_req_ext9, "foo/", None); prefix!(extract_prefix1, "/foo", Some(s("/foo"))); prefix!(extract_prefix2, "/foo/*", Some(s("/foo/"))); prefix!(extract_prefix3, "**/foo", None); prefix!(extract_prefix4, "foo/**", Some(s("foo/"))); 
suffix!(extract_suffix1, "**/foo/bar", Some((s("/foo/bar"), true))); suffix!(extract_suffix2, "*/foo/bar", Some((s("/foo/bar"), false))); suffix!(extract_suffix3, "*/foo/bar", None, SLASHLIT); suffix!(extract_suffix4, "foo/bar", Some((s("foo/bar"), false))); suffix!(extract_suffix5, "*.foo", Some((s(".foo"), false))); suffix!(extract_suffix6, "*.foo", None, SLASHLIT); suffix!(extract_suffix7, "**/*_test", Some((s("_test"), false))); baseliteral!(extract_baselit1, "**/foo", Some(s("foo"))); baseliteral!(extract_baselit2, "foo", None); baseliteral!(extract_baselit3, "*foo", None); baseliteral!(extract_baselit4, "*/foo", None); } globset-0.4.8/src/lib.rs000064400000000000000000000662170072674642500132430ustar 00000000000000/*! The globset crate provides cross platform single glob and glob set matching. Glob set matching is the process of matching one or more glob patterns against a single candidate path simultaneously, and returning all of the globs that matched. For example, given this set of globs: ```ignore *.rs src/lib.rs src/**/foo.rs ``` and a path `src/bar/baz/foo.rs`, then the set would report the first and third globs as matching. # Example: one glob This example shows how to match a single glob against a single file path. ``` # fn example() -> Result<(), globset::Error> { use globset::Glob; let glob = Glob::new("*.rs")?.compile_matcher(); assert!(glob.is_match("foo.rs")); assert!(glob.is_match("foo/bar.rs")); assert!(!glob.is_match("Cargo.toml")); # Ok(()) } example().unwrap(); ``` # Example: configuring a glob matcher This example shows how to use a `GlobBuilder` to configure aspects of match semantics. In this example, we prevent wildcards from matching path separators. 
```
# fn example() -> Result<(), globset::Error> {
use globset::GlobBuilder;

let glob = GlobBuilder::new("*.rs")
    .literal_separator(true).build()?.compile_matcher();
assert!(glob.is_match("foo.rs"));
assert!(!glob.is_match("foo/bar.rs")); // no longer matches
assert!(!glob.is_match("Cargo.toml"));
# Ok(()) } example().unwrap();
```

# Example: match multiple globs at once

This example shows how to match multiple glob patterns at once.

```
# fn example() -> Result<(), globset::Error> {
use globset::{Glob, GlobSetBuilder};

let mut builder = GlobSetBuilder::new();
// A GlobBuilder can be used to configure each glob's match semantics
// independently.
builder.add(Glob::new("*.rs")?);
builder.add(Glob::new("src/lib.rs")?);
builder.add(Glob::new("src/**/foo.rs")?);
let set = builder.build()?;

assert_eq!(set.matches("src/bar/baz/foo.rs"), vec![0, 2]);
# Ok(()) } example().unwrap();
```

# Syntax

Standard Unix-style glob syntax is supported:

* `?` matches any single character. (If the `literal_separator` option is
  enabled, then `?` can never match a path separator.)
* `*` matches zero or more characters. (If the `literal_separator` option is
  enabled, then `*` can never match a path separator.)
* `**` recursively matches directories, but only in three situations.
  First, if the glob starts with `**/`, then it matches all directories.
  For example, `**/foo` matches `foo` and `bar/foo` but not `foo/bar`.
  Secondly, if the glob ends with `/**`, then it matches all sub-entries.
  For example, `foo/**` matches `foo/a` and `foo/a/b`, but not `foo`.
  Thirdly, if the glob contains `/**/` anywhere within the pattern, then it
  matches zero or more directories. Anywhere else, `**` is treated as two
  consecutive `*` patterns (N.B. the glob `**` on its own is allowed and
  means "match everything").
* `{a,b}` matches `a` or `b` where `a` and `b` are arbitrary glob patterns.
  (N.B. Nesting `{...}` is not currently allowed.)
* `[ab]` matches `a` or `b` where `a` and `b` are characters.
Use `[!ab]` to match any character except for `a` and `b`. * Metacharacters such as `*` and `?` can be escaped with character class notation. e.g., `[*]` matches `*`. * When backslash escapes are enabled, a backslash (`\`) will escape all meta characters in a glob. If it precedes a non-meta character, then the slash is ignored. A `\\` will match a literal `\\`. Note that this mode is only enabled on Unix platforms by default, but can be enabled on any platform via the `backslash_escape` setting on `Glob`. A `GlobBuilder` can be used to prevent wildcards from matching path separators, or to enable case insensitive matching. */ #![deny(missing_docs)] use std::borrow::Cow; use std::collections::{BTreeMap, HashMap}; use std::error::Error as StdError; use std::fmt; use std::hash; use std::path::Path; use std::str; use aho_corasick::AhoCorasick; use bstr::{ByteSlice, ByteVec, B}; use regex::bytes::{Regex, RegexBuilder, RegexSet}; use crate::glob::MatchStrategy; pub use crate::glob::{Glob, GlobBuilder, GlobMatcher}; use crate::pathutil::{file_name, file_name_ext, normalize_path}; mod glob; mod pathutil; #[cfg(feature = "serde1")] mod serde_impl; /// Represents an error that can occur when parsing a glob pattern. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Error { /// The original glob provided by the caller. glob: Option, /// The kind of error. kind: ErrorKind, } /// The kind of error that can occur when parsing a glob pattern. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// **DEPRECATED**. /// /// This error used to occur for consistency with git's glob specification, /// but the specification now accepts all uses of `**`. When `**` does not /// appear adjacent to a path separator or at the beginning/end of a glob, /// it is now treated as two consecutive `*` patterns. As such, this error /// is no longer used. InvalidRecursive, /// Occurs when a character class (e.g., `[abc]`) is not closed. 
UnclosedClass, /// Occurs when a range in a character (e.g., `[a-z]`) is invalid. For /// example, if the range starts with a lexicographically larger character /// than it ends with. InvalidRange(char, char), /// Occurs when a `}` is found without a matching `{`. UnopenedAlternates, /// Occurs when a `{` is found without a matching `}`. UnclosedAlternates, /// Occurs when an alternating group is nested inside another alternating /// group, e.g., `{{a,b},{c,d}}`. NestedAlternates, /// Occurs when an unescaped '\' is found at the end of a glob. DanglingEscape, /// An error associated with parsing or compiling a regex. Regex(String), /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients /// don't count on exhaustive matching. (Otherwise, adding a new variant /// could break existing code.) #[doc(hidden)] __Nonexhaustive, } impl StdError for Error { fn description(&self) -> &str { self.kind.description() } } impl Error { /// Return the glob that caused this error, if one exists. pub fn glob(&self) -> Option<&str> { self.glob.as_ref().map(|s| &**s) } /// Return the kind of this error. 
pub fn kind(&self) -> &ErrorKind { &self.kind } } impl ErrorKind { fn description(&self) -> &str { match *self { ErrorKind::InvalidRecursive => { "invalid use of **; must be one path component" } ErrorKind::UnclosedClass => { "unclosed character class; missing ']'" } ErrorKind::InvalidRange(_, _) => "invalid character range", ErrorKind::UnopenedAlternates => { "unopened alternate group; missing '{' \ (maybe escape '}' with '[}]'?)" } ErrorKind::UnclosedAlternates => { "unclosed alternate group; missing '}' \ (maybe escape '{' with '[{]'?)" } ErrorKind::NestedAlternates => { "nested alternate groups are not allowed" } ErrorKind::DanglingEscape => "dangling '\\'", ErrorKind::Regex(ref err) => err, ErrorKind::__Nonexhaustive => unreachable!(), } } } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self.glob { None => self.kind.fmt(f), Some(ref glob) => { write!(f, "error parsing glob '{}': {}", glob, self.kind) } } } } impl fmt::Display for ErrorKind { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { ErrorKind::InvalidRecursive | ErrorKind::UnclosedClass | ErrorKind::UnopenedAlternates | ErrorKind::UnclosedAlternates | ErrorKind::NestedAlternates | ErrorKind::DanglingEscape | ErrorKind::Regex(_) => write!(f, "{}", self.description()), ErrorKind::InvalidRange(s, e) => { write!(f, "invalid range; '{}' > '{}'", s, e) } ErrorKind::__Nonexhaustive => unreachable!(), } } } fn new_regex(pat: &str) -> Result { RegexBuilder::new(pat) .dot_matches_new_line(true) .size_limit(10 * (1 << 20)) .dfa_size_limit(10 * (1 << 20)) .build() .map_err(|err| Error { glob: Some(pat.to_string()), kind: ErrorKind::Regex(err.to_string()), }) } fn new_regex_set(pats: I) -> Result where S: AsRef, I: IntoIterator, { RegexSet::new(pats).map_err(|err| Error { glob: None, kind: ErrorKind::Regex(err.to_string()), }) } type Fnv = hash::BuildHasherDefault; /// GlobSet represents a group of globs that can be matched together in a /// 
single pass. #[derive(Clone, Debug)] pub struct GlobSet { len: usize, strats: Vec, } impl GlobSet { /// Create an empty `GlobSet`. An empty set matches nothing. #[inline] pub fn empty() -> GlobSet { GlobSet { len: 0, strats: vec![] } } /// Returns true if this set is empty, and therefore matches nothing. #[inline] pub fn is_empty(&self) -> bool { self.len == 0 } /// Returns the number of globs in this set. #[inline] pub fn len(&self) -> usize { self.len } /// Returns true if any glob in this set matches the path given. pub fn is_match>(&self, path: P) -> bool { self.is_match_candidate(&Candidate::new(path.as_ref())) } /// Returns true if any glob in this set matches the path given. /// /// This takes a Candidate as input, which can be used to amortize the /// cost of preparing a path for matching. pub fn is_match_candidate(&self, path: &Candidate<'_>) -> bool { if self.is_empty() { return false; } for strat in &self.strats { if strat.is_match(path) { return true; } } false } /// Returns the sequence number of every glob pattern that matches the /// given path. pub fn matches>(&self, path: P) -> Vec { self.matches_candidate(&Candidate::new(path.as_ref())) } /// Returns the sequence number of every glob pattern that matches the /// given path. /// /// This takes a Candidate as input, which can be used to amortize the /// cost of preparing a path for matching. pub fn matches_candidate(&self, path: &Candidate<'_>) -> Vec { let mut into = vec![]; if self.is_empty() { return into; } self.matches_candidate_into(path, &mut into); into } /// Adds the sequence number of every glob pattern that matches the given /// path to the vec given. /// /// `into` is cleared before matching begins, and contains the set of /// sequence numbers (in ascending order) after matching ends. If no globs /// were matched, then `into` will be empty. 
pub fn matches_into>( &self, path: P, into: &mut Vec, ) { self.matches_candidate_into(&Candidate::new(path.as_ref()), into); } /// Adds the sequence number of every glob pattern that matches the given /// path to the vec given. /// /// `into` is cleared before matching begins, and contains the set of /// sequence numbers (in ascending order) after matching ends. If no globs /// were matched, then `into` will be empty. /// /// This takes a Candidate as input, which can be used to amortize the /// cost of preparing a path for matching. pub fn matches_candidate_into( &self, path: &Candidate<'_>, into: &mut Vec, ) { into.clear(); if self.is_empty() { return; } for strat in &self.strats { strat.matches_into(path, into); } into.sort(); into.dedup(); } fn new(pats: &[Glob]) -> Result { if pats.is_empty() { return Ok(GlobSet { len: 0, strats: vec![] }); } let mut lits = LiteralStrategy::new(); let mut base_lits = BasenameLiteralStrategy::new(); let mut exts = ExtensionStrategy::new(); let mut prefixes = MultiStrategyBuilder::new(); let mut suffixes = MultiStrategyBuilder::new(); let mut required_exts = RequiredExtensionStrategyBuilder::new(); let mut regexes = MultiStrategyBuilder::new(); for (i, p) in pats.iter().enumerate() { match MatchStrategy::new(p) { MatchStrategy::Literal(lit) => { lits.add(i, lit); } MatchStrategy::BasenameLiteral(lit) => { base_lits.add(i, lit); } MatchStrategy::Extension(ext) => { exts.add(i, ext); } MatchStrategy::Prefix(prefix) => { prefixes.add(i, prefix); } MatchStrategy::Suffix { suffix, component } => { if component { lits.add(i, suffix[1..].to_string()); } suffixes.add(i, suffix); } MatchStrategy::RequiredExtension(ext) => { required_exts.add(i, ext, p.regex().to_owned()); } MatchStrategy::Regex => { log::debug!("glob converted to regex: {:?}", p); regexes.add(i, p.regex().to_owned()); } } } log::debug!( "built glob set; {} literals, {} basenames, {} extensions, \ {} prefixes, {} suffixes, {} required extensions, {} regexes", 
lits.0.len(), base_lits.0.len(), exts.0.len(), prefixes.literals.len(), suffixes.literals.len(), required_exts.0.len(), regexes.literals.len() ); Ok(GlobSet { len: pats.len(), strats: vec![ GlobSetMatchStrategy::Extension(exts), GlobSetMatchStrategy::BasenameLiteral(base_lits), GlobSetMatchStrategy::Literal(lits), GlobSetMatchStrategy::Suffix(suffixes.suffix()), GlobSetMatchStrategy::Prefix(prefixes.prefix()), GlobSetMatchStrategy::RequiredExtension( required_exts.build()?, ), GlobSetMatchStrategy::Regex(regexes.regex_set()?), ], }) } } impl Default for GlobSet { /// Create a default empty GlobSet. fn default() -> Self { GlobSet::empty() } } /// GlobSetBuilder builds a group of patterns that can be used to /// simultaneously match a file path. #[derive(Clone, Debug)] pub struct GlobSetBuilder { pats: Vec, } impl GlobSetBuilder { /// Create a new GlobSetBuilder. A GlobSetBuilder can be used to add new /// patterns. Once all patterns have been added, `build` should be called /// to produce a `GlobSet`, which can then be used for matching. pub fn new() -> GlobSetBuilder { GlobSetBuilder { pats: vec![] } } /// Builds a new matcher from all of the glob patterns added so far. /// /// Once a matcher is built, no new patterns can be added to it. pub fn build(&self) -> Result { GlobSet::new(&self.pats) } /// Add a new pattern to this set. pub fn add(&mut self, pat: Glob) -> &mut GlobSetBuilder { self.pats.push(pat); self } } /// A candidate path for matching. /// /// All glob matching in this crate operates on `Candidate` values. /// Constructing candidates has a very small cost associated with it, so /// callers may find it beneficial to amortize that cost when matching a single /// path against multiple globs or sets of globs. #[derive(Clone, Debug)] pub struct Candidate<'a> { path: Cow<'a, [u8]>, basename: Cow<'a, [u8]>, ext: Cow<'a, [u8]>, } impl<'a> Candidate<'a> { /// Create a new candidate for matching from the given path. 
pub fn new + ?Sized>(path: &'a P) -> Candidate<'a> { let path = normalize_path(Vec::from_path_lossy(path.as_ref())); let basename = file_name(&path).unwrap_or(Cow::Borrowed(B(""))); let ext = file_name_ext(&basename).unwrap_or(Cow::Borrowed(B(""))); Candidate { path: path, basename: basename, ext: ext } } fn path_prefix(&self, max: usize) -> &[u8] { if self.path.len() <= max { &*self.path } else { &self.path[..max] } } fn path_suffix(&self, max: usize) -> &[u8] { if self.path.len() <= max { &*self.path } else { &self.path[self.path.len() - max..] } } } #[derive(Clone, Debug)] enum GlobSetMatchStrategy { Literal(LiteralStrategy), BasenameLiteral(BasenameLiteralStrategy), Extension(ExtensionStrategy), Prefix(PrefixStrategy), Suffix(SuffixStrategy), RequiredExtension(RequiredExtensionStrategy), Regex(RegexSetStrategy), } impl GlobSetMatchStrategy { fn is_match(&self, candidate: &Candidate<'_>) -> bool { use self::GlobSetMatchStrategy::*; match *self { Literal(ref s) => s.is_match(candidate), BasenameLiteral(ref s) => s.is_match(candidate), Extension(ref s) => s.is_match(candidate), Prefix(ref s) => s.is_match(candidate), Suffix(ref s) => s.is_match(candidate), RequiredExtension(ref s) => s.is_match(candidate), Regex(ref s) => s.is_match(candidate), } } fn matches_into( &self, candidate: &Candidate<'_>, matches: &mut Vec, ) { use self::GlobSetMatchStrategy::*; match *self { Literal(ref s) => s.matches_into(candidate, matches), BasenameLiteral(ref s) => s.matches_into(candidate, matches), Extension(ref s) => s.matches_into(candidate, matches), Prefix(ref s) => s.matches_into(candidate, matches), Suffix(ref s) => s.matches_into(candidate, matches), RequiredExtension(ref s) => s.matches_into(candidate, matches), Regex(ref s) => s.matches_into(candidate, matches), } } } #[derive(Clone, Debug)] struct LiteralStrategy(BTreeMap, Vec>); impl LiteralStrategy { fn new() -> LiteralStrategy { LiteralStrategy(BTreeMap::new()) } fn add(&mut self, global_index: usize, lit: String) { 
self.0.entry(lit.into_bytes()).or_insert(vec![]).push(global_index); } fn is_match(&self, candidate: &Candidate<'_>) -> bool { self.0.contains_key(candidate.path.as_bytes()) } #[inline(never)] fn matches_into( &self, candidate: &Candidate<'_>, matches: &mut Vec, ) { if let Some(hits) = self.0.get(candidate.path.as_bytes()) { matches.extend(hits); } } } #[derive(Clone, Debug)] struct BasenameLiteralStrategy(BTreeMap, Vec>); impl BasenameLiteralStrategy { fn new() -> BasenameLiteralStrategy { BasenameLiteralStrategy(BTreeMap::new()) } fn add(&mut self, global_index: usize, lit: String) { self.0.entry(lit.into_bytes()).or_insert(vec![]).push(global_index); } fn is_match(&self, candidate: &Candidate<'_>) -> bool { if candidate.basename.is_empty() { return false; } self.0.contains_key(candidate.basename.as_bytes()) } #[inline(never)] fn matches_into( &self, candidate: &Candidate<'_>, matches: &mut Vec, ) { if candidate.basename.is_empty() { return; } if let Some(hits) = self.0.get(candidate.basename.as_bytes()) { matches.extend(hits); } } } #[derive(Clone, Debug)] struct ExtensionStrategy(HashMap, Vec, Fnv>); impl ExtensionStrategy { fn new() -> ExtensionStrategy { ExtensionStrategy(HashMap::with_hasher(Fnv::default())) } fn add(&mut self, global_index: usize, ext: String) { self.0.entry(ext.into_bytes()).or_insert(vec![]).push(global_index); } fn is_match(&self, candidate: &Candidate<'_>) -> bool { if candidate.ext.is_empty() { return false; } self.0.contains_key(candidate.ext.as_bytes()) } #[inline(never)] fn matches_into( &self, candidate: &Candidate<'_>, matches: &mut Vec, ) { if candidate.ext.is_empty() { return; } if let Some(hits) = self.0.get(candidate.ext.as_bytes()) { matches.extend(hits); } } } #[derive(Clone, Debug)] struct PrefixStrategy { matcher: AhoCorasick, map: Vec, longest: usize, } impl PrefixStrategy { fn is_match(&self, candidate: &Candidate<'_>) -> bool { let path = candidate.path_prefix(self.longest); for m in 
self.matcher.find_overlapping_iter(path) { if m.start() == 0 { return true; } } false } fn matches_into( &self, candidate: &Candidate<'_>, matches: &mut Vec, ) { let path = candidate.path_prefix(self.longest); for m in self.matcher.find_overlapping_iter(path) { if m.start() == 0 { matches.push(self.map[m.pattern()]); } } } } #[derive(Clone, Debug)] struct SuffixStrategy { matcher: AhoCorasick, map: Vec, longest: usize, } impl SuffixStrategy { fn is_match(&self, candidate: &Candidate<'_>) -> bool { let path = candidate.path_suffix(self.longest); for m in self.matcher.find_overlapping_iter(path) { if m.end() == path.len() { return true; } } false } fn matches_into( &self, candidate: &Candidate<'_>, matches: &mut Vec, ) { let path = candidate.path_suffix(self.longest); for m in self.matcher.find_overlapping_iter(path) { if m.end() == path.len() { matches.push(self.map[m.pattern()]); } } } } #[derive(Clone, Debug)] struct RequiredExtensionStrategy(HashMap, Vec<(usize, Regex)>, Fnv>); impl RequiredExtensionStrategy { fn is_match(&self, candidate: &Candidate<'_>) -> bool { if candidate.ext.is_empty() { return false; } match self.0.get(candidate.ext.as_bytes()) { None => false, Some(regexes) => { for &(_, ref re) in regexes { if re.is_match(candidate.path.as_bytes()) { return true; } } false } } } #[inline(never)] fn matches_into( &self, candidate: &Candidate<'_>, matches: &mut Vec, ) { if candidate.ext.is_empty() { return; } if let Some(regexes) = self.0.get(candidate.ext.as_bytes()) { for &(global_index, ref re) in regexes { if re.is_match(candidate.path.as_bytes()) { matches.push(global_index); } } } } } #[derive(Clone, Debug)] struct RegexSetStrategy { matcher: RegexSet, map: Vec, } impl RegexSetStrategy { fn is_match(&self, candidate: &Candidate<'_>) -> bool { self.matcher.is_match(candidate.path.as_bytes()) } fn matches_into( &self, candidate: &Candidate<'_>, matches: &mut Vec, ) { for i in self.matcher.matches(candidate.path.as_bytes()) { matches.push(self.map[i]); 
} } } #[derive(Clone, Debug)] struct MultiStrategyBuilder { literals: Vec, map: Vec, longest: usize, } impl MultiStrategyBuilder { fn new() -> MultiStrategyBuilder { MultiStrategyBuilder { literals: vec![], map: vec![], longest: 0 } } fn add(&mut self, global_index: usize, literal: String) { if literal.len() > self.longest { self.longest = literal.len(); } self.map.push(global_index); self.literals.push(literal); } fn prefix(self) -> PrefixStrategy { PrefixStrategy { matcher: AhoCorasick::new_auto_configured(&self.literals), map: self.map, longest: self.longest, } } fn suffix(self) -> SuffixStrategy { SuffixStrategy { matcher: AhoCorasick::new_auto_configured(&self.literals), map: self.map, longest: self.longest, } } fn regex_set(self) -> Result { Ok(RegexSetStrategy { matcher: new_regex_set(self.literals)?, map: self.map, }) } } #[derive(Clone, Debug)] struct RequiredExtensionStrategyBuilder( HashMap, Vec<(usize, String)>>, ); impl RequiredExtensionStrategyBuilder { fn new() -> RequiredExtensionStrategyBuilder { RequiredExtensionStrategyBuilder(HashMap::new()) } fn add(&mut self, global_index: usize, ext: String, regex: String) { self.0 .entry(ext.into_bytes()) .or_insert(vec![]) .push((global_index, regex)); } fn build(self) -> Result { let mut exts = HashMap::with_hasher(Fnv::default()); for (ext, regexes) in self.0.into_iter() { exts.insert(ext.clone(), vec![]); for (global_index, regex) in regexes { let compiled = new_regex(®ex)?; exts.get_mut(&ext).unwrap().push((global_index, compiled)); } } Ok(RequiredExtensionStrategy(exts)) } } #[cfg(test)] mod tests { use super::{GlobSet, GlobSetBuilder}; use crate::glob::Glob; #[test] fn set_works() { let mut builder = GlobSetBuilder::new(); builder.add(Glob::new("src/**/*.rs").unwrap()); builder.add(Glob::new("*.c").unwrap()); builder.add(Glob::new("src/lib.rs").unwrap()); let set = builder.build().unwrap(); assert!(set.is_match("foo.c")); assert!(set.is_match("src/foo.c")); assert!(!set.is_match("foo.rs")); 
assert!(!set.is_match("tests/foo.rs")); assert!(set.is_match("src/foo.rs")); assert!(set.is_match("src/grep/src/main.rs")); let matches = set.matches("src/lib.rs"); assert_eq!(2, matches.len()); assert_eq!(0, matches[0]); assert_eq!(2, matches[1]); } #[test] fn empty_set_works() { let set = GlobSetBuilder::new().build().unwrap(); assert!(!set.is_match("")); assert!(!set.is_match("a")); } #[test] fn default_set_is_empty_works() { let set: GlobSet = Default::default(); assert!(!set.is_match("")); assert!(!set.is_match("a")); } } globset-0.4.8/src/pathutil.rs000064400000000000000000000077160072674642500143260ustar 00000000000000use std::borrow::Cow; use bstr::{ByteSlice, ByteVec}; /// The final component of the path, if it is a normal file. /// /// If the path terminates in ., .., or consists solely of a root of prefix, /// file_name will return None. pub fn file_name<'a>(path: &Cow<'a, [u8]>) -> Option> { if path.is_empty() { return None; } else if path.last_byte() == Some(b'.') { return None; } let last_slash = path.rfind_byte(b'/').map(|i| i + 1).unwrap_or(0); Some(match *path { Cow::Borrowed(path) => Cow::Borrowed(&path[last_slash..]), Cow::Owned(ref path) => { let mut path = path.clone(); path.drain_bytes(..last_slash); Cow::Owned(path) } }) } /// Return a file extension given a path's file name. /// /// Note that this does NOT match the semantics of std::path::Path::extension. /// Namely, the extension includes the `.` and matching is otherwise more /// liberal. Specifically, the extenion is: /// /// * None, if the file name given is empty; /// * None, if there is no embedded `.`; /// * Otherwise, the portion of the file name starting with the final `.`. /// /// e.g., A file name of `.rs` has an extension `.rs`. /// /// N.B. This is done to make certain glob match optimizations easier. 
Namely, /// a pattern like `*.rs` is obviously trying to match files with a `rs` /// extension, but it also matches files like `.rs`, which doesn't have an /// extension according to std::path::Path::extension. pub fn file_name_ext<'a>(name: &Cow<'a, [u8]>) -> Option> { if name.is_empty() { return None; } let last_dot_at = match name.rfind_byte(b'.') { None => return None, Some(i) => i, }; Some(match *name { Cow::Borrowed(name) => Cow::Borrowed(&name[last_dot_at..]), Cow::Owned(ref name) => { let mut name = name.clone(); name.drain_bytes(..last_dot_at); Cow::Owned(name) } }) } /// Normalizes a path to use `/` as a separator everywhere, even on platforms /// that recognize other characters as separators. #[cfg(unix)] pub fn normalize_path(path: Cow<'_, [u8]>) -> Cow<'_, [u8]> { // UNIX only uses /, so we're good. path } /// Normalizes a path to use `/` as a separator everywhere, even on platforms /// that recognize other characters as separators. #[cfg(not(unix))] pub fn normalize_path(mut path: Cow<[u8]>) -> Cow<[u8]> { use std::path::is_separator; for i in 0..path.len() { if path[i] == b'/' || !is_separator(path[i] as char) { continue; } path.to_mut()[i] = b'/'; } path } #[cfg(test)] mod tests { use std::borrow::Cow; use bstr::{ByteVec, B}; use super::{file_name_ext, normalize_path}; macro_rules! ext { ($name:ident, $file_name:expr, $ext:expr) => { #[test] fn $name() { let bs = Vec::from($file_name); let got = file_name_ext(&Cow::Owned(bs)); assert_eq!($ext.map(|s| Cow::Borrowed(B(s))), got); } }; } ext!(ext1, "foo.rs", Some(".rs")); ext!(ext2, ".rs", Some(".rs")); ext!(ext3, "..rs", Some(".rs")); ext!(ext4, "", None::<&str>); ext!(ext5, "foo", None::<&str>); macro_rules! 
normalize { ($name:ident, $path:expr, $expected:expr) => { #[test] fn $name() { let bs = Vec::from_slice($path); let got = normalize_path(Cow::Owned(bs)); assert_eq!($expected.to_vec(), got.into_owned()); } }; } normalize!(normal1, b"foo", b"foo"); normalize!(normal2, b"foo/bar", b"foo/bar"); #[cfg(unix)] normalize!(normal3, b"foo\\bar", b"foo\\bar"); #[cfg(not(unix))] normalize!(normal3, b"foo\\bar", b"foo/bar"); #[cfg(unix)] normalize!(normal4, b"foo\\bar/baz", b"foo\\bar/baz"); #[cfg(not(unix))] normalize!(normal4, b"foo\\bar/baz", b"foo/bar/baz"); } globset-0.4.8/src/serde_impl.rs000064400000000000000000000016140072674642500146060ustar 00000000000000use serde::de::Error; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use crate::Glob; impl Serialize for Glob { fn serialize( &self, serializer: S, ) -> Result { serializer.serialize_str(self.glob()) } } impl<'de> Deserialize<'de> for Glob { fn deserialize>( deserializer: D, ) -> Result { let glob = <&str as Deserialize>::deserialize(deserializer)?; Glob::new(glob).map_err(D::Error::custom) } } #[cfg(test)] mod tests { use Glob; #[test] fn glob_json_works() { let test_glob = Glob::new("src/**/*.rs").unwrap(); let ser = serde_json::to_string(&test_glob).unwrap(); assert_eq!(ser, "\"src/**/*.rs\""); let de: Glob = serde_json::from_str(&ser).unwrap(); assert_eq!(test_glob, de); } }