pcre2-0.2.2/.gitignore010066400017500000144000000000651333554511500127430ustar0000000000000000/target /pcre2-sys/target **/*.rs.bk Cargo.lock tags pcre2-0.2.2/.gitmodules010064400017500000144000000002011345471441500131210ustar0000000000000000[submodule "pcre2-sys/pcre2"] path = pcre2-sys/pcre2 url = https://github.com/BurntSushi/pcre2-mirror branch = release/10.32 pcre2-0.2.2/.travis.yml010064400017500000144000000002011345471441500130550ustar0000000000000000dist: trusty language: rust rust: - 1.33.0 - stable - beta - nightly script: ci/script.sh branches: only: - master pcre2-0.2.2/COPYING010064400017500000144000000001761333565153600120140ustar0000000000000000This project is dual-licensed under the Unlicense and MIT licenses. You may use this code under the terms of either license. pcre2-0.2.2/Cargo.toml.orig010064400017500000144000000011511353403710200136260ustar0000000000000000[package] name = "pcre2" version = "0.2.2" #:version authors = ["Andrew Gallant "] description = "High level wrapper library for PCRE2." documentation = "https://docs.rs/pcre2" homepage = "https://github.com/BurntSushi/rust-pcre2" repository = "https://github.com/BurntSushi/rust-pcre2" readme = "README.md" keywords = ["pcre", "pcre2", "regex", "jit", "perl"] license = "Unlicense/MIT" categories = ["text-processing"] edition = "2018" [workspace] members = ["pcre2-sys"] [dependencies] libc = "0.2.46" log = "0.4.5" pcre2-sys = { version = "0.2.0", path = "pcre2-sys" } thread_local = "0.3.6" pcre2-0.2.2/Cargo.toml0000644000000021630000000000000101070ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. 
If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] edition = "2018" name = "pcre2" version = "0.2.2" authors = ["Andrew Gallant "] description = "High level wrapper library for PCRE2." homepage = "https://github.com/BurntSushi/rust-pcre2" documentation = "https://docs.rs/pcre2" readme = "README.md" keywords = ["pcre", "pcre2", "regex", "jit", "perl"] categories = ["text-processing"] license = "Unlicense/MIT" repository = "https://github.com/BurntSushi/rust-pcre2" [dependencies.libc] version = "0.2.46" [dependencies.log] version = "0.4.5" [dependencies.pcre2-sys] version = "0.2.0" [dependencies.thread_local] version = "0.3.6" pcre2-0.2.2/LICENSE-MIT010064400017500000144000000020711333565153600124110ustar0000000000000000The MIT License (MIT) Copyright (c) 2017 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
pcre2-0.2.2/README.md010064400017500000144000000021111333611323200122120ustar0000000000000000pcre2 ===== A high level Rust wrapper library for [PCRE2](https://www.pcre.org/). [![Linux build status](https://api.travis-ci.org/BurntSushi/rust-pcre2.png)](https://travis-ci.org/BurntSushi/rust-pcre2) [![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/rust-pcre2?svg=true)](https://ci.appveyor.com/project/BurntSushi/rust-pcre2) [![](http://meritbadge.herokuapp.com/pcre2)](https://crates.io/crates/pcre2) Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). ### Documentation https://docs.rs/pcre2 ### Usage Add this to your `Cargo.toml`: ```toml [dependencies] pcre2 = "0.2" ``` and this to your crate root: ```rust extern crate pcre2; ``` ### Notes Currently, this is a fairly light layer around PCRE2 itself and does not even come close to covering all of its functionality. There are no specific plans in place to build out the wrapper further, but PRs for making more of PCRE2 available are welcome, although my bandwidth for maintenance is limited. If you're interested in sharing this maintenance burden, please reach out. pcre2-0.2.2/UNLICENSE010064400017500000144000000022731333565153600122310ustar0000000000000000This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For more information, please refer to <http://unlicense.org/> pcre2-0.2.2/appveyor.yml010064400017500000144000000011351333614463000133360ustar0000000000000000environment: matrix: - TARGET: x86_64-pc-windows-gnu BITS: 64 MSYS2: 1 - TARGET: x86_64-pc-windows-msvc BITS: 64 - TARGET: i686-pc-windows-gnu BITS: 32 MSYS2: 1 - TARGET: i686-pc-windows-msvc BITS: 32 install: - curl -sSf -o rustup-init.exe https://win.rustup.rs/ - rustup-init.exe -y --default-host %TARGET% - set PATH=%PATH%;C:\Users\appveyor\.cargo\bin - if defined MSYS2 set PATH=C:\msys64\mingw%BITS%\bin;%PATH% - rustc -V - cargo -V build: false test_script: - cargo build --verbose --all - cargo test --verbose --all branches: only: - master pcre2-0.2.2/ci/script.sh010077500017500000144000000001451333611274300132060ustar0000000000000000#!/bin/sh set -ex cargo build --verbose --all cargo doc --verbose --all cargo test --verbose --all pcre2-0.2.2/src/bytes.rs010066400017500000144000001320431353403705700132420ustar0000000000000000use std::cell::RefCell; use std::collections::HashMap; use std::fmt; use std::ops::Index; use std::sync::Arc; use log::debug; use pcre2_sys::{ PCRE2_CASELESS, PCRE2_DOTALL, PCRE2_EXTENDED, PCRE2_MULTILINE, PCRE2_UCP, PCRE2_UTF, PCRE2_NO_UTF_CHECK, PCRE2_UNSET, PCRE2_NEWLINE_ANYCRLF, }; use thread_local::CachedThreadLocal; use crate::error::Error; use crate::ffi::{Code, CompileContext, MatchConfig, MatchData}; /// Match represents a single match of a regex in a subject string. 
/// /// The lifetime parameter `'s` refers to the lifetime of the matched portion /// of the subject string. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub struct Match<'s> { subject: &'s [u8], start: usize, end: usize, } impl<'s> Match<'s> { /// Returns the starting byte offset of the match in the subject. #[inline] pub fn start(&self) -> usize { self.start } /// Returns the ending byte offset of the match in the subject. #[inline] pub fn end(&self) -> usize { self.end } /// Returns the matched portion of the subject string. #[inline] pub fn as_bytes(&self) -> &'s [u8] { &self.subject[self.start..self.end] } /// Creates a new match from the given subject string and byte offsets. fn new(subject: &'s [u8], start: usize, end: usize) -> Match<'s> { Match { subject, start, end } } #[cfg(test)] fn as_pair(&self) -> (usize, usize) { (self.start, self.end) } } #[derive(Clone, Debug)] struct Config { /// PCRE2_CASELESS caseless: bool, /// PCRE2_DOTALL dotall: bool, /// PCRE2_EXTENDED extended: bool, /// PCRE2_MULTILINE multi_line: bool, /// PCRE2_NEWLINE_ANYCRLF crlf: bool, /// PCRE2_UCP ucp: bool, /// PCRE2_UTF utf: bool, /// PCRE2_NO_UTF_CHECK utf_check: bool, /// use pcre2_jit_compile jit: JITChoice, /// Match-time specific configuration knobs. match_config: MatchConfig, } #[derive(Clone, Debug)] enum JITChoice { /// Never do JIT compilation. Never, /// Always do JIT compilation and return an error if it fails. Always, /// Attempt to do JIT compilation but silently fall back to non-JIT. Attempt, } impl Default for Config { fn default() -> Config { Config { caseless: false, dotall: false, extended: false, multi_line: false, crlf: false, ucp: false, utf: false, utf_check: true, jit: JITChoice::Never, match_config: MatchConfig::default(), } } } /// A builder for configuring the compilation of a PCRE2 regex. #[derive(Clone, Debug)] pub struct RegexBuilder { config: Config, } impl RegexBuilder { /// Create a new builder with a default configuration. 
pub fn new() -> RegexBuilder { RegexBuilder { config: Config::default() } } /// Compile the given pattern into a PCRE regex using the current /// configuration. /// /// If there was a problem compiling the pattern, then an error is /// returned. pub fn build(&self, pattern: &str) -> Result { let mut options = 0; if self.config.caseless { options |= PCRE2_CASELESS; } if self.config.dotall { options |= PCRE2_DOTALL; } if self.config.extended { options |= PCRE2_EXTENDED; } if self.config.multi_line { options |= PCRE2_MULTILINE; } if self.config.ucp { options |= PCRE2_UCP; options |= PCRE2_UTF; } if self.config.utf { options |= PCRE2_UTF; } let mut ctx = CompileContext::new(); if self.config.crlf { ctx.set_newline(PCRE2_NEWLINE_ANYCRLF) .expect("PCRE2_NEWLINE_ANYCRLF is a legal value"); } let mut code = Code::new(pattern, options, ctx)?; match self.config.jit { JITChoice::Never => {} // fallthrough JITChoice::Always => { code.jit_compile()?; } JITChoice::Attempt => { if let Err(err) = code.jit_compile() { debug!("JIT compilation failed: {}", err); } } } let capture_names = code.capture_names()?; let mut idx = HashMap::new(); for (i, group) in capture_names.iter().enumerate() { if let Some(ref name) = *group { idx.insert(name.to_string(), i); } } Ok(Regex { config: Arc::new(self.config.clone()), pattern: pattern.to_string(), code: Arc::new(code), capture_names: Arc::new(capture_names), capture_names_idx: Arc::new(idx), match_data: CachedThreadLocal::new(), }) } /// Enables case insensitive matching. /// /// If the `utf` option is also set, then Unicode case folding is used /// to determine case insensitivity. When the `utf` option is not set, /// then only standard ASCII case insensitivity is considered. /// /// This option corresponds to the `i` flag. pub fn caseless(&mut self, yes: bool) -> &mut RegexBuilder { self.config.caseless = yes; self } /// Enables "dot all" matching. 
/// /// When enabled, the `.` metacharacter in the pattern matches any /// character, include `\n`. When disabled (the default), `.` will match /// any character except for `\n`. /// /// This option corresponds to the `s` flag. pub fn dotall(&mut self, yes: bool) -> &mut RegexBuilder { self.config.dotall = yes; self } /// Enable "extended" mode in the pattern, where whitespace is ignored. /// /// This option corresponds to the `x` flag. pub fn extended(&mut self, yes: bool) -> &mut RegexBuilder { self.config.extended = yes; self } /// Enable multiline matching mode. /// /// When enabled, the `^` and `$` anchors will match both at the beginning /// and end of a subject string, in addition to matching at the start of /// a line and the end of a line. When disabled, the `^` and `$` anchors /// will only match at the beginning and end of a subject string. /// /// This option corresponds to the `m` flag. pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { self.config.multi_line = yes; self } /// Enable matching of CRLF as a line terminator. /// /// When enabled, anchors such as `^` and `$` will match any of the /// following as a line terminator: `\r`, `\n` or `\r\n`. /// /// This is disabled by default, in which case, only `\n` is recognized as /// a line terminator. pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { self.config.crlf = yes; self } /// Enable Unicode matching mode. /// /// When enabled, the following patterns become Unicode aware: `\b`, `\B`, /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`. /// /// When set, this implies UTF matching mode. It is not possible to enable /// Unicode matching mode without enabling UTF matching mode. /// /// This is disabled by default. pub fn ucp(&mut self, yes: bool) -> &mut RegexBuilder { self.config.ucp = yes; self } /// Enable UTF matching mode. /// /// When enabled, characters are treated as sequences of code units that /// make up a single codepoint instead of as single bytes. 
For example, /// this will cause `.` to match any single UTF-8 encoded codepoint, where /// as when this is disabled, `.` will any single byte (except for `\n` in /// both cases, unless "dot all" mode is enabled). /// /// Note that when UTF matching mode is enabled, every search performed /// will do a UTF-8 validation check, which can impact performance. The /// UTF-8 check can be disabled via the `disable_utf_check` option, but it /// is undefined behavior to enable UTF matching mode and search invalid /// UTF-8. /// /// This is disabled by default. pub fn utf(&mut self, yes: bool) -> &mut RegexBuilder { self.config.utf = yes; self } /// When UTF matching mode is enabled, this will disable the UTF checking /// that PCRE2 will normally perform automatically. If UTF matching mode /// is not enabled, then this has no effect. /// /// UTF checking is enabled by default when UTF matching mode is enabled. /// If UTF matching mode is enabled and UTF checking is enabled, then PCRE2 /// will return an error if you attempt to search a subject string that is /// not valid UTF-8. /// /// # Safety /// /// It is undefined behavior to disable the UTF check in UTF matching mode /// and search a subject string that is not valid UTF-8. When the UTF check /// is disabled, callers must guarantee that the subject string is valid /// UTF-8. pub unsafe fn disable_utf_check(&mut self) -> &mut RegexBuilder { self.config.utf_check = false; self } /// Enable PCRE2's JIT and return an error if it's not available. /// /// This generally speeds up matching quite a bit. The downside is that it /// can increase the time it takes to compile a pattern. /// /// If the JIT isn't available or if JIT compilation returns an error, then /// regex compilation will fail with the corresponding error. /// /// This is disabled by default, and always overrides `jit_if_available`. 
pub fn jit(&mut self, yes: bool) -> &mut RegexBuilder { if yes { self.config.jit = JITChoice::Always; } else { self.config.jit = JITChoice::Never; } self } /// Enable PCRE2's JIT if it's available. /// /// This generally speeds up matching quite a bit. The downside is that it /// can increase the time it takes to compile a pattern. /// /// If the JIT isn't available or if JIT compilation returns an error, /// then a debug message with the error will be emitted and the regex will /// otherwise silently fall back to non-JIT matching. /// /// This is disabled by default, and always overrides `jit`. pub fn jit_if_available(&mut self, yes: bool) -> &mut RegexBuilder { if yes { self.config.jit = JITChoice::Attempt; } else { self.config.jit = JITChoice::Never; } self } /// Set the maximum size of PCRE2's JIT stack, in bytes. If the JIT is /// not enabled, then this has no effect. /// /// When `None` is given, no custom JIT stack will be created, and instead, /// the default JIT stack is used. When the default is used, its maximum /// size is 32 KB. /// /// When this is set, then a new JIT stack will be created with the given /// maximum size as its limit. /// /// Increasing the stack size can be useful for larger regular expressions. /// /// By default, this is set to `None`. pub fn max_jit_stack_size( &mut self, bytes: Option, ) -> &mut RegexBuilder { self.config.match_config.max_jit_stack_size = bytes; self } } /// A compiled PCRE2 regular expression. /// /// This regex is safe to use from multiple threads simultaneously. For top /// performance, it is better to clone a new regex for each thread. pub struct Regex { /// The configuration used to build the regex. config: Arc, /// The original pattern string. pattern: String, /// The underlying compiled PCRE2 object. code: Arc, /// The capture group names for this regex. capture_names: Arc>>, /// A map from capture group name to capture group index. 
capture_names_idx: Arc>, /// Mutable scratch data used by PCRE2 during matching. /// /// We use the same strategy as Rust's regex crate here, such that each /// thread gets its own match data to support using a Regex object from /// multiple threads simultaneously. If some match data doesn't exist for /// a thread, then a new one is created on demand. match_data: CachedThreadLocal>, } impl Clone for Regex { fn clone(&self) -> Regex { Regex { config: Arc::clone(&self.config), pattern: self.pattern.clone(), code: Arc::clone(&self.code), capture_names: Arc::clone(&self.capture_names), capture_names_idx: Arc::clone(&self.capture_names_idx), match_data: CachedThreadLocal::new(), } } } impl fmt::Debug for Regex { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Regex({:?})", self.pattern) } } impl Regex { /// Compiles a regular expression using the default configuration. /// /// Once compiled, it can be used repeatedly to search, split or replace /// text in a string. /// /// If an invalid expression is given, then an error is returned. /// /// To configure compilation options for the regex, use the /// [`RegexBuilder`](struct.RegexBuilder.html). pub fn new(pattern: &str) -> Result { RegexBuilder::new().build(pattern) } /// Returns true if and only if the regex matches the subject string given. /// /// # Example /// /// Test if some text contains at least one word with exactly 13 ASCII word /// bytes: /// /// ```rust /// # fn example() -> Result<(), ::pcre2::Error> { /// use pcre2::bytes::Regex; /// /// let text = b"I categorically deny having triskaidekaphobia."; /// assert!(Regex::new(r"\b\w{13}\b")?.is_match(text)?); /// # Ok(()) }; example().unwrap() /// ``` pub fn is_match(&self, subject: &[u8]) -> Result { self.is_match_at(subject, 0) } /// Returns the start and end byte range of the leftmost-first match in /// `subject`. If no match exists, then `None` is returned. 
/// /// # Example /// /// Find the start and end location of the first word with exactly 13 /// ASCII word bytes: /// /// ```rust /// # fn example() -> Result<(), ::pcre2::Error> { /// use pcre2::bytes::Regex; /// /// let text = b"I categorically deny having triskaidekaphobia."; /// let mat = Regex::new(r"\b\w{13}\b")?.find(text)?.unwrap(); /// assert_eq!((mat.start(), mat.end()), (2, 15)); /// # Ok(()) }; example().unwrap() /// ``` pub fn find<'s>( &self, subject: &'s [u8], ) -> Result>, Error> { self.find_at(subject, 0) } /// Returns an iterator for each successive non-overlapping match in /// `subject`, returning the start and end byte indices with respect to /// `subject`. /// /// # Example /// /// Find the start and end location of every word with exactly 13 ASCII /// word bytes: /// /// ```rust /// # fn example() -> Result<(), ::pcre2::Error> { /// use pcre2::bytes::Regex; /// /// let text = b"Retroactively relinquishing remunerations is reprehensible."; /// for result in Regex::new(r"\b\w{13}\b")?.find_iter(text) { /// let mat = result?; /// println!("{:?}", mat); /// } /// # Ok(()) }; example().unwrap() /// ``` pub fn find_iter<'r, 's>(&'r self, subject: &'s [u8]) -> Matches<'r, 's> { Matches { re: self, match_data: self.match_data(), subject: subject, last_end: 0, last_match: None, } } /// Returns the capture groups corresponding to the leftmost-first /// match in `subject`. Capture group `0` always corresponds to the entire /// match. If no match is found, then `None` is returned. /// /// # Examples /// /// Say you have some text with movie names and their release years, /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text /// looking like that, while also extracting the movie name and its release /// year separately. 
/// /// ```rust /// # fn example() -> Result<(), ::pcre2::Error> { /// use pcre2::bytes::Regex; /// /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")?; /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text)?.unwrap(); /// assert_eq!(&caps[1], &b"Citizen Kane"[..]); /// assert_eq!(&caps[2], &b"1941"[..]); /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); /// // You can also access the groups by index using the Index notation. /// // Note that this will panic on an invalid index. /// assert_eq!(&caps[1], b"Citizen Kane"); /// assert_eq!(&caps[2], b"1941"); /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); /// # Ok(()) }; example().unwrap() /// ``` /// /// Note that the full match is at capture group `0`. Each subsequent /// capture group is indexed by the order of its opening `(`. /// /// We can make this example a bit clearer by using *named* capture groups: /// /// ```rust /// # fn example() -> Result<(), ::pcre2::Error> { /// use pcre2::bytes::Regex; /// /// let re = Regex::new(r"'(?P[^']+)'\s+\((?P<year>\d{4})\)")?; /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(text)?.unwrap(); /// assert_eq!(&caps["title"], &b"Citizen Kane"[..]); /// assert_eq!(&caps["year"], &b"1941"[..]); /// assert_eq!(&caps[0], &b"'Citizen Kane' (1941)"[..]); /// // You can also access the groups by name using the Index notation. /// // Note that this will panic on an invalid group name. /// assert_eq!(&caps["title"], b"Citizen Kane"); /// assert_eq!(&caps["year"], b"1941"); /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); /// # Ok(()) }; example().unwrap() /// ``` /// /// Here we name the capture groups, which we can access with the `name` /// method or the `Index` notation with a `&str`. Note that the named /// capture groups are still accessible with `get` or the `Index` notation /// with a `usize`. 
/// /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. pub fn captures<'s>( &self, subject: &'s [u8], ) -> Result<Option<Captures<'s>>, Error> { let mut locs = self.capture_locations(); Ok(self.captures_read(&mut locs, subject)?.map(move |_| Captures { subject: subject, locs: locs, idx: Arc::clone(&self.capture_names_idx), })) } /// Returns an iterator over all the non-overlapping capture groups matched /// in `subject`. This is operationally the same as `find_iter`, except it /// yields information about capturing group matches. /// /// # Example /// /// We can use this to find all movie titles and their release years in /// some text, where the movie is formatted like "'Title' (xxxx)": /// /// ```rust /// # fn example() -> Result<(), ::pcre2::Error> { /// use std::str; /// /// use pcre2::bytes::Regex; /// /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)")?; /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; /// for result in re.captures_iter(text) { /// let caps = result?; /// let title = str::from_utf8(&caps["title"]).unwrap(); /// let year = str::from_utf8(&caps["year"]).unwrap(); /// println!("Movie: {:?}, Released: {:?}", title, year); /// } /// // Output: /// // Movie: Citizen Kane, Released: 1941 /// // Movie: The Wizard of Oz, Released: 1939 /// // Movie: M, Released: 1931 /// # Ok(()) }; example().unwrap() /// ``` pub fn captures_iter<'r, 's>( &'r self, subject: &'s [u8], ) -> CaptureMatches<'r, 's> { CaptureMatches { re: self, subject: subject, last_end: 0, last_match: None, } } } /// Advanced or "lower level" search methods. impl Regex { /// Returns the same as is_match, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. 
pub fn is_match_at( &self, subject: &[u8], start: usize, ) -> Result<bool, Error> { assert!( start <= subject.len(), "start ({}) must be <= subject.len() ({})", start, subject.len() ); let mut options = 0; if !self.config.utf_check { options |= PCRE2_NO_UTF_CHECK; } let match_data = self.match_data(); let mut match_data = match_data.borrow_mut(); // SAFETY: The only unsafe PCRE2 option we potentially use here is // PCRE2_NO_UTF_CHECK, and that only occurs if the caller executes the // `disable_utf_check` method, which propagates the safety contract to // the caller. Ok(unsafe { match_data.find(&self.code, subject, start, options)? }) } /// Returns the same as find, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. pub fn find_at<'s>( &self, subject: &'s [u8], start: usize, ) -> Result<Option<Match<'s>>, Error> { self.find_at_with_match_data(self.match_data(), subject, start) } /// Like find_at, but accepts match data instead of acquiring one itself. /// /// This is useful for implementing the iterator, which permits avoiding /// the synchronization overhead of acquiring the match data. #[inline(always)] fn find_at_with_match_data<'s>( &self, match_data: &RefCell<MatchData>, subject: &'s [u8], start: usize, ) -> Result<Option<Match<'s>>, Error> { assert!( start <= subject.len(), "start ({}) must be <= subject.len() ({})", start, subject.len() ); let mut options = 0; if !self.config.utf_check { options |= PCRE2_NO_UTF_CHECK; } let mut match_data = match_data.borrow_mut(); // SAFETY: The only unsafe PCRE2 option we potentially use here is // PCRE2_NO_UTF_CHECK, and that only occurs if the caller executes the // `disable_utf_check` method, which propagates the safety contract to // the caller. if unsafe { !match_data.find(&self.code, subject, start, options)? 
} { return Ok(None); } let ovector = match_data.ovector(); let (s, e) = (ovector[0], ovector[1]); Ok(Some(Match::new(&subject[s..e], s, e))) } /// This is like `captures`, but uses /// [`CaptureLocations`](struct.CaptureLocations.html) /// instead of /// [`Captures`](struct.Captures.html) in order to amortize allocations. /// /// To create a `CaptureLocations` value, use the /// `Regex::capture_locations` method. /// /// This returns the overall match if this was successful, which is always /// equivalent to the `0`th capture group. pub fn captures_read<'s>( &self, locs: &mut CaptureLocations, subject: &'s [u8], ) -> Result<Option<Match<'s>>, Error> { self.captures_read_at(locs, subject, 0) } /// Returns the same as `captures_read`, but starts the search at the given /// offset and populates the capture locations given. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. pub fn captures_read_at<'s>( &self, locs: &mut CaptureLocations, subject: &'s [u8], start: usize, ) -> Result<Option<Match<'s>>, Error> { assert!( start <= subject.len(), "start ({}) must be <= subject.len() ({})", start, subject.len() ); let mut options = 0; if !self.config.utf_check { options |= PCRE2_NO_UTF_CHECK; } // SAFETY: The only unsafe PCRE2 option we potentially use here is // PCRE2_NO_UTF_CHECK, and that only occurs if the caller executes the // `disable_utf_check` method, which propagates the safety contract to // the caller. if unsafe { !locs.data.find(&self.code, subject, start, options)? } { return Ok(None); } let ovector = locs.data.ovector(); let (s, e) = (ovector[0], ovector[1]); Ok(Some(Match::new(&subject[s..e], s, e))) } } /// Auxiliary methods. impl Regex { /// Returns the original pattern string for this regex. pub fn as_str(&self) -> &str { &self.pattern } /// Returns a sequence of all capturing groups and their names, if present. 
/// /// The length of the slice returned is always equal to the result of /// `captures_len`, which is the number of capturing groups (including the /// capturing group for the entire pattern). /// /// Each entry in the slice is the name of the corresponding capturing /// group, if one exists. The first capturing group (at index `0`) is /// always unnamed. /// /// Capturing groups are indexed by the order of the opening parenthesis. pub fn capture_names(&self) -> &[Option<String>] { &self.capture_names } /// Returns the number of capturing groups in the pattern. /// /// This is always 1 more than the number of syntactic groups in the /// pattern, since the first group always corresponds to the entire match. pub fn captures_len(&self) -> usize { self.code.capture_count().expect("a valid capture count from PCRE2") } /// Returns an empty set of capture locations that can be reused in /// multiple calls to `captures_read` or `captures_read_at`. pub fn capture_locations(&self) -> CaptureLocations { CaptureLocations { code: Arc::clone(&self.code), data: self.new_match_data(), } } fn match_data(&self) -> &RefCell<MatchData> { let create = || Box::new(RefCell::new(self.new_match_data())); self.match_data.get_or(create) } fn new_match_data(&self) -> MatchData { MatchData::new(self.config.match_config.clone(), &self.code) } } /// CaptureLocations is a low level representation of the raw offsets of each /// submatch. /// /// Primarily, this type is useful when using `Regex` APIs such as /// `captures_read`, which permits amortizing the allocation in which capture /// match locations are stored. /// /// In order to build a value of this type, you'll need to call the /// `capture_locations` method on the `Regex` being used to execute the search. /// The value returned can then be reused in subsequent searches. 
pub struct CaptureLocations { code: Arc<Code>, data: MatchData, } impl Clone for CaptureLocations { fn clone(&self) -> CaptureLocations { CaptureLocations { code: Arc::clone(&self.code), data: MatchData::new(self.data.config().clone(), &self.code), } } } impl fmt::Debug for CaptureLocations { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut offsets: Vec<Option<usize>> = vec![]; for &offset in self.data.ovector() { if offset == PCRE2_UNSET { offsets.push(None); } else { offsets.push(Some(offset)); } } write!(f, "CaptureLocations(")?; f.debug_list().entries(offsets).finish()?; write!(f, ")") } } impl CaptureLocations { /// Returns the start and end positions of the Nth capture group. /// /// This returns `None` if `i` is not a valid capture group or if the /// capture group did not match anything. /// /// The positions returned are always byte indices with respect to the /// original subject string matched. #[inline] pub fn get(&self, i: usize) -> Option<(usize, usize)> { let ovec = self.data.ovector(); let s = match ovec.get(i * 2) { None => return None, Some(&s) if s == PCRE2_UNSET => return None, Some(&s) => s, }; let e = match ovec.get(i * 2 + 1) { None => return None, Some(&e) if e == PCRE2_UNSET => return None, Some(&e) => e, }; Some((s, e)) } /// Returns the total number of capturing groups. /// /// This is always at least `1` since every regex has at least `1` /// capturing group that corresponds to the entire match. #[inline] pub fn len(&self) -> usize { self.data.ovector().len() / 2 } } /// Captures represents a group of captured byte strings for a single match. /// /// The 0th capture always corresponds to the entire match. Each subsequent /// index corresponds to the next capture group in the regex. If a capture /// group is named, then the matched byte string is *also* available via the /// `name` method. (Note that the 0th capture is always unnamed and so must be /// accessed with the `get` method.) 
/// /// Positions returned from a capture group are always byte indices. /// /// `'s` is the lifetime of the matched subject string. pub struct Captures<'s> { subject: &'s [u8], locs: CaptureLocations, idx: Arc<HashMap<String, usize>>, } impl<'s> Captures<'s> { /// Returns the match associated with the capture group at index `i`. If /// `i` does not correspond to a capture group, or if the capture group /// did not participate in the match, then `None` is returned. /// /// # Examples /// /// Get the text of the match with a default of an empty string if this /// group didn't participate in the match: /// /// ```rust /// # fn example() -> Result<(), ::pcre2::Error> { /// use pcre2::bytes::Regex; /// /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))")?; /// let caps = re.captures(b"abc123")?.unwrap(); /// /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); /// assert_eq!(text1, &b"123"[..]); /// assert_eq!(text2, &b""[..]); /// # Ok(()) }; example().unwrap() /// ``` pub fn get(&self, i: usize) -> Option<Match<'s>> { self.locs.get(i).map(|(s, e)| Match::new(self.subject, s, e)) } /// Returns the match for the capture group named `name`. If `name` isn't a /// valid capture group or didn't match anything, then `None` is returned. pub fn name(&self, name: &str) -> Option<Match<'s>> { self.idx.get(name).and_then(|&i| self.get(i)) } /// Returns the number of captured groups. /// /// This is always at least `1`, since every regex has at least one capture /// group that corresponds to the full match. 
#[inline] pub fn len(&self) -> usize { self.locs.len() } } impl<'s> fmt::Debug for Captures<'s> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_tuple("Captures").field(&CapturesDebug(self)).finish() } } struct CapturesDebug<'c, 's: 'c>(&'c Captures<'s>); impl<'c, 's> fmt::Debug for CapturesDebug<'c, 's> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn escape_bytes(bytes: &[u8]) -> String { let mut s = String::new(); for &b in bytes { s.push_str(&escape_byte(b)); } s } fn escape_byte(byte: u8) -> String { use std::ascii::escape_default; let escaped: Vec<u8> = escape_default(byte).collect(); String::from_utf8_lossy(&escaped).into_owned() } // We'd like to show something nice here, even if it means an // allocation to build a reverse index. let slot_to_name: HashMap<&usize, &String> = self.0.idx.iter().map(|(a, b)| (b, a)).collect(); let mut map = f.debug_map(); for slot in 0..self.0.len() { let m = self.0.locs.get(slot).map(|(s, e)| { escape_bytes(&self.0.subject[s..e]) }); if let Some(name) = slot_to_name.get(&slot) { map.entry(&name, &m); } else { map.entry(&slot, &m); } } map.finish() } } /// Get a group by index. /// /// `'s` is the lifetime of the matched subject string. /// /// The subject can't outlive the `Captures` object if this method is /// used, because of how `Index` is defined (normally `a[i]` is part /// of `a` and can't outlive it); to do that, use `get()` instead. /// /// # Panics /// /// If there is no group at the given index. impl<'s> Index<usize> for Captures<'s> { type Output = [u8]; fn index(&self, i: usize) -> &[u8] { self.get(i).map(|m| m.as_bytes()) .unwrap_or_else(|| panic!("no group at index '{}'", i)) } } /// Get a group by name. /// /// `'s` is the lifetime of the matched subject string and `'i` is the lifetime /// of the group name (the index). 
///
/// The text can't outlive the `Captures` object if this method is
/// used, because of how `Index` is defined (normally `a[i]` is part
/// of `a` and can't outlive it); to do that, use `name` instead.
///
/// # Panics
///
/// If `name` returns `None` for the given value, i.e. there is no group with
/// that name or the group did not participate in the match.
impl<'s, 'i> Index<&'i str> for Captures<'s> {
    type Output = [u8];

    fn index<'a>(&'a self, name: &'i str) -> &'a [u8] {
        self.name(name).map(|m| m.as_bytes())
            .unwrap_or_else(|| panic!("no group named '{}'", name))
    }
}

/// An iterator over all non-overlapping matches for a particular subject
/// string.
///
/// The iterator yields matches (if no error occurred while searching)
/// corresponding to the start and end of the match. The indices are byte
/// offsets. The iterator stops when no more matches can be found.
///
/// When a search fails, the error is yielded and the iterator's position is
/// left unchanged, so a subsequent call to `next` repeats the same search.
///
/// `'r` is the lifetime of the compiled regular expression and `'s` is the
/// lifetime of the subject string.
pub struct Matches<'r, 's> {
    /// The regex being executed.
    re: &'r Regex,
    /// The match data block passed to every search.
    match_data: &'r RefCell<MatchData>,
    /// The bytes being searched.
    subject: &'s [u8],
    /// The offset at which the next search starts.
    last_end: usize,
    /// The end offset of the most recently yielded match, if any.
    last_match: Option<usize>,
}

impl<'r, 's> Iterator for Matches<'r, 's> {
    type Item = Result<Match<'s>, Error>;

    fn next(&mut self) -> Option<Result<Match<'s>, Error>> {
        // `last_end` can be one past the end of the subject after an empty
        // match at the very end; there is nothing left to search then.
        if self.last_end > self.subject.len() {
            return None;
        }
        let res = self.re.find_at_with_match_data(
            self.match_data,
            self.subject,
            self.last_end,
        );
        let m = match res {
            // Note that no state is updated before returning the error.
            Err(err) => return Some(Err(err)),
            Ok(None) => return None,
            Ok(Some(m)) => m,
        };
        if m.start() == m.end() {
            // This is an empty match. To ensure we make progress, start
            // the next search at the smallest possible starting position
            // of the next match following this one.
            self.last_end = m.end() + 1;
            // Don't accept empty matches immediately following a match.
            // Just move on to the next match.
            if Some(m.end()) == self.last_match {
                return self.next();
            }
        } else {
            self.last_end = m.end();
        }
        self.last_match = Some(m.end());
        Some(Ok(m))
    }
}

/// An iterator that yields all non-overlapping capture groups matching a
/// particular regular expression.
///
/// The iterator stops when no more matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'s` is the
/// lifetime of the subject string.
pub struct CaptureMatches<'r, 's> {
    /// The regex being executed.
    re: &'r Regex,
    /// The bytes being searched.
    subject: &'s [u8],
    /// The offset at which the next search starts.
    last_end: usize,
    /// The end offset of the most recently yielded match, if any.
    last_match: Option<usize>,
}

impl<'r, 's> Iterator for CaptureMatches<'r, 's> {
    type Item = Result<Captures<'s>, Error>;

    fn next(&mut self) -> Option<Result<Captures<'s>, Error>> {
        // `last_end` can be one past the end of the subject after an empty
        // match at the very end; there is nothing left to search then.
        if self.last_end > self.subject.len() {
            return None;
        }
        // A new set of capture locations is created on every call.
        let mut locs = self.re.capture_locations();
        let res = self.re.captures_read_at(
            &mut locs,
            self.subject,
            self.last_end,
        );
        let m = match res {
            // Note that no state is updated before returning the error.
            Err(err) => return Some(Err(err)),
            Ok(None) => return None,
            Ok(Some(m)) => m,
        };
        if m.start() == m.end() {
            // This is an empty match. To ensure we make progress, start
            // the next search at the smallest possible starting position
            // of the next match following this one.
            self.last_end = m.end() + 1;
            // Don't accept empty matches immediately following a match.
            // Just move on to the next match.
if Some(m.end()) == self.last_match { return self.next(); } } else { self.last_end = m.end(); } self.last_match = Some(m.end()); Some(Ok(Captures { subject: self.subject, locs: locs, idx: Arc::clone(&self.re.capture_names_idx), })) } } #[cfg(test)] mod tests { use super::{Regex, RegexBuilder}; use crate::is_jit_available; fn b(string: &str) -> &[u8] { string.as_bytes() } fn find_iter_tuples(re: &Regex, subject: &[u8]) -> Vec<(usize, usize)> { let mut tuples = vec![]; for result in re.find_iter(subject) { let m = result.unwrap(); tuples.push((m.start(), m.end())); } tuples } fn cap_iter_tuples(re: &Regex, subject: &[u8]) -> Vec<(usize, usize)> { let mut tuples = vec![]; for result in re.captures_iter(subject) { let caps = result.unwrap(); let m = caps.get(0).unwrap(); tuples.push((m.start(), m.end())); } tuples } #[test] fn caseless() { let re = RegexBuilder::new() .caseless(true) .build("a") .unwrap(); assert!(re.is_match(b("A")).unwrap()); let re = RegexBuilder::new() .caseless(true) .ucp(true) .build("β") .unwrap(); assert!(re.is_match(b("Β")).unwrap()); } #[test] fn crlf() { let re = RegexBuilder::new() .crlf(true) .build("a$") .unwrap(); let m = re.find(b("a\r\n")).unwrap().unwrap(); assert_eq!(m.as_pair(), (0, 1)); } #[test] fn dotall() { let re = RegexBuilder::new() .dotall(false) .build(".") .unwrap(); assert!(!re.is_match(b("\n")).unwrap()); let re = RegexBuilder::new() .dotall(true) .build(".") .unwrap(); assert!(re.is_match(b("\n")).unwrap()); } #[test] fn extended() { let re = RegexBuilder::new() .extended(true) .build("a b c") .unwrap(); assert!(re.is_match(b("abc")).unwrap()); } #[test] fn multi_line() { let re = RegexBuilder::new() .multi_line(false) .build("^abc$") .unwrap(); assert!(!re.is_match(b("foo\nabc\nbar")).unwrap()); let re = RegexBuilder::new() .multi_line(true) .build("^abc$") .unwrap(); assert!(re.is_match(b("foo\nabc\nbar")).unwrap()); } #[test] fn ucp() { let re = RegexBuilder::new() .ucp(false) .build(r"\w") .unwrap(); 
assert!(!re.is_match(b("β")).unwrap()); let re = RegexBuilder::new() .ucp(true) .build(r"\w") .unwrap(); assert!(re.is_match(b("β")).unwrap()); } #[test] fn utf() { let re = RegexBuilder::new() .utf(false) .build(".") .unwrap(); assert_eq!(re.find(b("β")).unwrap().unwrap().as_pair(), (0, 1)); let re = RegexBuilder::new() .utf(true) .build(".") .unwrap(); assert_eq!(re.find(b("β")).unwrap().unwrap().as_pair(), (0, 2)); } #[test] fn jit4lyfe() { if is_jit_available() { let re = RegexBuilder::new() .jit(true) .build(r"\w") .unwrap(); assert!(re.is_match(b("a")).unwrap()); } else { // Check that if JIT isn't enabled, then we get an error if we // require JIT. RegexBuilder::new() .jit(true) .build(r"\w") .unwrap_err(); } } // Unlike jit4lyfe, this tests that everything works when requesting the // JIT only if it's available. In jit4lyfe, we require the JIT or fail. // If the JIT isn't available, then in this test, we simply don't use it. #[test] fn jit_if_available() { let re = RegexBuilder::new() .jit_if_available(true) .build(r"\w") .unwrap(); assert!(re.is_match(b("a")).unwrap()); } // This tests a regression caused a segfault in the pcre2 library // https://github.com/BurntSushi/rust-pcre2/issues/10 #[test] fn jit_test_lazy_alloc_subject() { let subject: Vec<u8> = vec![]; let re = RegexBuilder::new() .jit_if_available(true) .build(r"xxxx|xxxx|xxxx") .unwrap(); assert!(!re.is_match(&subject).unwrap()); } #[test] fn utf_with_invalid_data() { let re = RegexBuilder::new() .build(r".") .unwrap(); assert_eq!(re.find(b"\xFF").unwrap().unwrap().as_pair(), (0, 1)); let re = RegexBuilder::new() .utf(true) .build(r".") .unwrap(); assert!(re.find(b"\xFF").is_err()); } #[test] fn capture_names() { let re = RegexBuilder::new() .build( r"(?P<foo>abc)|(def)|(?P<a>ghi)|(?P<springsteen>jkl)" ) .unwrap(); assert_eq!(re.capture_names().to_vec(), vec![ None, Some("foo".to_string()), None, Some("a".to_string()), Some("springsteen".to_string()), ]); // Test our internal map as well. 
assert_eq!(re.capture_names_idx.len(), 3); assert_eq!(re.capture_names_idx["foo"], 1); assert_eq!(re.capture_names_idx["a"], 3); assert_eq!(re.capture_names_idx["springsteen"], 4); } #[test] fn captures_get() { let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); let caps = re.captures(b"abc123").unwrap().unwrap(); let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); assert_eq!(text1, &b"123"[..]); assert_eq!(text2, &b""[..]); } #[test] fn find_iter_empty() { let re = Regex::new(r"(?m:^)").unwrap(); assert_eq!(find_iter_tuples(&re, b""), vec![(0, 0)]); assert_eq!(find_iter_tuples(&re, b"\n"), vec![(0, 0)]); assert_eq!(find_iter_tuples(&re, b"\n\n"), vec![(0, 0), (1, 1)]); assert_eq!(find_iter_tuples(&re, b"\na\n"), vec![(0, 0), (1, 1)]); assert_eq!(find_iter_tuples(&re, b"\na\n\n"), vec![ (0, 0), (1, 1), (3, 3), ]); } #[test] fn captures_iter_empty() { let re = Regex::new(r"(?m:^)").unwrap(); assert_eq!(cap_iter_tuples(&re, b""), vec![(0, 0)]); assert_eq!(cap_iter_tuples(&re, b"\n"), vec![(0, 0)]); assert_eq!(cap_iter_tuples(&re, b"\n\n"), vec![(0, 0), (1, 1)]); assert_eq!(cap_iter_tuples(&re, b"\na\n"), vec![(0, 0), (1, 1)]); assert_eq!(cap_iter_tuples(&re, b"\na\n\n"), vec![ (0, 0), (1, 1), (3, 3), ]); } #[test] fn max_jit_stack_size_does_something() { if !is_jit_available() { return; } let hundred = "\ ABCDEFGHIJKLMNOPQRSTUVWXY\ ABCDEFGHIJKLMNOPQRSTUVWXY\ ABCDEFGHIJKLMNOPQRSTUVWXY\ ABCDEFGHIJKLMNOPQRSTUVWXY\ "; let hay = format!("{}", hundred.repeat(100)); // First, try a regex that checks that we can blow the JIT stack limit. let re = RegexBuilder::new() .ucp(true) .jit(true) .max_jit_stack_size(Some(1)) .build(r"((((\w{10})){100}))+") .unwrap(); let result = re.is_match(hay.as_bytes()); if result.is_ok() { // Skip this test, since for some reason we weren't able to blow // the stack limit. 
return; } let err = result.unwrap_err(); assert!(err.to_string().contains("JIT stack limit reached")); // Now bump up the JIT stack limit and check that it succeeds. let re = RegexBuilder::new() .ucp(true) .jit(true) .max_jit_stack_size(Some(1<<20)) .build(r"((((\w{10})){100}))+") .unwrap(); assert!(re.is_match(hay.as_bytes()).unwrap()); } } ���������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������pcre2-0.2.2/src/error.rs����������������������������������������������������������������������������0100644�0001750�0000144�00000013231�13454714415�0013241�0����������������������������������������������������������������������������������������������������ustar�00����������������������������������������������������������������0000000�0000000������������������������������������������������������������������������������������������������������������������������������������������������������������������������use std::error; use std::fmt; use libc::c_int; use pcre2_sys::*; /// A PCRE2 error. /// /// An error can occur during compilation or during matching. The kind of this /// error indicates the type of operation being performed when the error /// occurred. #[derive(Clone)] pub struct Error { kind: ErrorKind, code: c_int, offset: Option<usize>, } /// The kind of an error indicates the type of operation that was attempted /// that resulted in an error. /// /// This enum may expand over time. #[derive(Clone, Debug)] pub enum ErrorKind { /// An error occurred during compilation of a regex. Compile, /// An error occurred during JIT compilation of a regex. 
JIT, /// An error occurred while matching. Match, /// An error occurred while querying a compiled regex for info. Info, /// An error occurred while setting an option. Option, /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients /// don't count on exhaustive matching. (Otherwise, adding a new variant /// could break existing code.) #[doc(hidden)] __Nonexhaustive, } impl Error { /// Create a new compilation error. pub(crate) fn compile(code: c_int, offset: usize) -> Error { Error { kind: ErrorKind::Compile, code: code, offset: Some(offset), } } /// Create a new JIT compilation error. pub(crate) fn jit(code: c_int) -> Error { Error { kind: ErrorKind::JIT, code: code, offset: None, } } /// Create a new matching error. pub(crate) fn matching(code: c_int) -> Error { Error { kind: ErrorKind::Match, code: code, offset: None, } } /// Create a new info error. pub(crate) fn info(code: c_int) -> Error { Error { kind: ErrorKind::Info, code: code, offset: None, } } /// Create a new option error. pub(crate) fn option(code: c_int) -> Error { Error { kind: ErrorKind::Option, code: code, offset: None, } } /// Return the kind of this error. /// /// The kind indicates the type of operation that was attempted which /// resulted in this error. pub fn kind(&self) -> &ErrorKind { &self.kind } /// Return the raw underlying PCRE2 error code. /// /// This can be useful if one needs to determine exactly which error /// occurred, which can be done with case analysis over the constants /// exported in the `pcre2-sys` crate. pub fn code(&self) -> c_int { self.code } /// Return the underlying offset associated with this error, if one exists. /// /// The offset is typically only available for compile time errors, and /// is supposed to indicate the general position in the pattern where an /// error occurred. pub fn offset(&self) -> Option<usize> { self.offset } /// Returns the error message from PCRE2. 
fn error_message(&self) -> String { // PCRE2 docs say a buffer size of 120 bytes is enough, but we're // cautious and double it. let mut buf = [0u8; 240]; let rc = unsafe { pcre2_get_error_message_8(self.code, buf.as_mut_ptr(), buf.len()) }; // Errors are only ever constructed from codes reported by PCRE2, so // our code should always be valid. assert!(rc != PCRE2_ERROR_BADDATA, "used an invalid error code"); // PCRE2 docs claim 120 bytes is enough, and we use more, so... assert!(rc != PCRE2_ERROR_NOMEMORY, "buffer size too small"); // Sanity check that we do indeed have a non-negative result. 0 is OK. assert!(rc >= 0, "expected non-negative but got {}", rc); String::from_utf8(buf[..rc as usize].to_vec()).expect("valid UTF-8") } } impl error::Error for Error { fn description(&self) -> &str { "pcre2 error" } } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let msg = self.error_message(); match self.kind { ErrorKind::Compile => { match self.offset { None => { write!(f, "PCRE2: error compiling pattern: {}", msg) } Some(offset) => { write!( f, "PCRE2: error compiling pattern at offset {}: {}", offset, msg ) } } } ErrorKind::JIT => { write!(f, "PCRE2: error JIT compiling pattern: {}", msg) } ErrorKind::Match => { write!(f, "PCRE2: error matching: {}", msg) } ErrorKind::Info => { write!(f, "PCRE2: error getting info: {}", msg) } ErrorKind::Option => { write!(f, "PCRE2: error setting option: {}", msg) } _ => unreachable!(), } } } impl fmt::Debug for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { // We include the error message in the debug representation since // most humans probably don't have PCRE2 error codes memorized. 
f.debug_struct("Error") .field("kind", &self.kind) .field("code", &self.code) .field("offset", &self.offset) .field("message", &self.error_message()) .finish() } } �����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������pcre2-0.2.2/src/ffi.rs������������������������������������������������������������������������������0100664�0001750�0000144�00000036605�13534037057�0012667�0����������������������������������������������������������������������������������������������������ustar�00����������������������������������������������������������������0000000�0000000������������������������������������������������������������������������������������������������������������������������������������������������������������������������/*! This module defines a low level and *mostly* safe abstraction around the core PCRE2 regex primitives. Callers may still need to deal with some unsafety, but this layer will take care of the obvious things, such as resource management and error handling. */ use std::cmp; use std::ptr; use std::slice; use libc::c_void; use pcre2_sys::*; use crate::error::Error; /// Returns true if and only if PCRE2 believes that JIT is available. pub fn is_jit_available() -> bool { let mut rc: u32 = 0; let error_code = unsafe { pcre2_config_8(PCRE2_CONFIG_JIT, &mut rc as *mut _ as *mut c_void) }; if error_code < 0 { // If PCRE2_CONFIG_JIT is a bad option, then there's a bug somewhere. panic!("BUG: {}", Error::jit(error_code)); } rc == 1 } /// Returns the version of PCRE2 being used. /// /// The tuple returned corresponds to the major and minor version, e.g., /// `(10, 32)`. 
pub fn version() -> (u32, u32) { (PCRE2_MAJOR, PCRE2_MINOR) } /// A low level representation of a compiled PCRE2 code object. pub struct Code { code: *mut pcre2_code_8, compiled_jit: bool, // We hang on to this but don't use it so that it gets freed when the // compiled code gets freed. It's not clear whether this is necessary or // not, but presumably doesn't cost us much to be conservative. #[allow(dead_code)] ctx: CompileContext, } // SAFETY: Compiled PCRE2 code objects are immutable once built and explicitly // safe to use from multiple threads simultaneously. // // One hitch here is that JIT compiling can write into a PCRE2 code object, but // we only ever JIT compile immediately after first building the code object // and before making it available to the caller. unsafe impl Send for Code {} unsafe impl Sync for Code {} impl Drop for Code { fn drop(&mut self) { unsafe { pcre2_code_free_8(self.code) } } } impl Code { /// Compile the given pattern with the given options. If there was a /// problem compiling the pattern, then return an error. pub fn new( pattern: &str, options: u32, mut ctx: CompileContext, ) -> Result<Code, Error> { let (mut error_code, mut error_offset) = (0, 0); let code = unsafe { pcre2_compile_8( pattern.as_ptr(), pattern.len(), options, &mut error_code, &mut error_offset, ctx.as_mut_ptr(), ) }; if code.is_null() { Err(Error::compile(error_code, error_offset)) } else { Ok(Code { code, compiled_jit: false, ctx }) } } /// JIT compile this code object. /// /// If there was a problem performing JIT compilation, then this returns /// an error. pub fn jit_compile(&mut self) -> Result<(), Error> { let error_code = unsafe { pcre2_jit_compile_8(self.code, PCRE2_JIT_COMPLETE) }; if error_code == 0 { self.compiled_jit = true; Ok(()) } else { Err(Error::jit(error_code)) } } /// Build and return an ordered sequence of all capture group names in this /// compiled regex. 
/// /// The returned vector has a slot for every capturing group (including the /// one corresponding to the entire regex, which is always unnamed). Groups /// that are unnamed are set to `None`. /// /// If there was a problem querying the compiled object for information, /// then this returns an error. pub fn capture_names(&self) -> Result<Vec<Option<String>>, Error> { // This is an object lesson in why C sucks. All we need is a map from // a name to a number, but we need to go through all sorts of // shenanigans to get it. In order to verify this code, see // https://www.pcre.org/current/doc/html/pcre2api.html // and search for PCRE2_INFO_NAMETABLE. let name_count = self.name_count()?; let size = self.name_entry_size()?; let table = unsafe { slice::from_raw_parts(self.raw_name_table()?, name_count * size) }; let mut names = vec![None; self.capture_count()?]; for i in 0..name_count { let entry = &table[i * size..(i + 1) * size]; let name = &entry[2..]; let nulat = name .iter() .position(|&b| b == 0) .expect("a NUL in name table entry"); let index = (entry[0] as usize) << 8 | (entry[1] as usize); names[index] = String::from_utf8(name[..nulat].to_vec()) .map(Some) // We require our pattern to be valid UTF-8, so all capture // names should also be valid UTF-8. .expect("valid UTF-8 for capture name"); } Ok(names) } /// Return the underlying raw pointer to the code object. pub fn as_ptr(&self) -> *const pcre2_code_8 { self.code } /// Returns the raw name table, where each entry in the table corresponds /// to a mapping between a named capturing group and the index of that /// capturing group. The encoding for each item is as follows: /// /// * 2 bytes encoding the capture index (big-endian) /// * N bytes encoding the code units of the name /// * 1 byte for the NUL terminator /// * M padding bytes, corresponding to the difference in length between /// this name and the longest name. /// /// In particular, each entry uses the same number of bytes. 
/// /// Entries are in alphabetical order. fn raw_name_table(&self) -> Result<*const u8, Error> { let mut bytes: *const u8 = ptr::null(); let rc = unsafe { pcre2_pattern_info_8( self.as_ptr(), PCRE2_INFO_NAMETABLE, &mut bytes as *mut *const u8 as *mut c_void, ) }; if rc != 0 { Err(Error::info(rc)) } else { Ok(bytes) } } /// Returns the number of named capturing groups. fn name_count(&self) -> Result<usize, Error> { let mut count: u32 = 0; let rc = unsafe { pcre2_pattern_info_8( self.as_ptr(), PCRE2_INFO_NAMECOUNT, &mut count as *mut u32 as *mut c_void, ) }; if rc != 0 { Err(Error::info(rc)) } else { Ok(count as usize) } } /// Returns the entry size of each name in the name table. /// /// This appears to correspond to `3` plus the size of the longest named /// capturing group. The extra 3 bytes correspond to a NUL terminator and /// two prefix bytes corresponding to a big-endian encoding of the index /// of the capture group. fn name_entry_size(&self) -> Result<usize, Error> { let mut size: u32 = 0; let rc = unsafe { pcre2_pattern_info_8( self.as_ptr(), PCRE2_INFO_NAMEENTRYSIZE, &mut size as *mut u32 as *mut c_void, ) }; if rc != 0 { Err(Error::info(rc)) } else { Ok(size as usize) } } /// Returns the total number of capturing groups in this regex. This /// includes the capturing group for the entire pattern, so that this is /// always 1 more than the number of syntactic groups in the pattern. pub fn capture_count(&self) -> Result<usize, Error> { let mut count: u32 = 0; let rc = unsafe { pcre2_pattern_info_8( self.as_ptr(), PCRE2_INFO_CAPTURECOUNT, &mut count as *mut u32 as *mut c_void, ) }; if rc != 0 { Err(Error::info(rc)) } else { Ok(1 + count as usize) } } } /// A low level representation of PCRE2's compilation context. pub struct CompileContext(*mut pcre2_compile_context_8); // SAFETY: Compile contexts are safe to read from multiple threads // simultaneously. No interior mutability is used, so Sync is safe. 
unsafe impl Send for CompileContext {} unsafe impl Sync for CompileContext {} impl Drop for CompileContext { fn drop(&mut self) { unsafe { pcre2_compile_context_free_8(self.0) } } } impl CompileContext { /// Create a new empty compilation context. /// /// If memory could not be allocated for the context, then this panics. pub fn new() -> CompileContext { let ctx = unsafe { pcre2_compile_context_create_8(ptr::null_mut()) }; assert!(!ctx.is_null(), "could not allocate compile context"); CompileContext(ctx) } /// Set the PCRE2 newline sequence. /// /// Valid values are: PCRE2_NEWLINE_CR, PCRE2_NEWLINE_LF, /// PCRE2_NEWLINE_CRLF, PCRE2_NEWLINE_ANYCRLF, PCRE2_NEWLINE_ANY or /// PCRE2_NEWLINE_NUL. Using any other value results in an error. pub fn set_newline(&mut self, value: u32) -> Result<(), Error> { let rc = unsafe { pcre2_set_newline_8(self.0, value) }; if rc == 0 { Ok(()) } else { Err(Error::option(rc)) } } fn as_mut_ptr(&mut self) -> *mut pcre2_compile_context_8 { self.0 } } /// Configuration for PCRE2's match context. #[derive(Clone, Debug)] pub struct MatchConfig { /// When set, a custom JIT stack will be created with the given maximum /// size. pub max_jit_stack_size: Option<usize>, } impl Default for MatchConfig { fn default() -> MatchConfig { MatchConfig { max_jit_stack_size: None, } } } /// A low level representation of a match data block. /// /// Technically, a single match data block can be used with multiple regexes /// (not simultaneously), but in practice, we just create a single match data /// block for each regex for each thread it's used in. pub struct MatchData { config: MatchConfig, match_context: *mut pcre2_match_context_8, match_data: *mut pcre2_match_data_8, jit_stack: Option<*mut pcre2_jit_stack_8>, ovector_ptr: *const usize, ovector_count: u32, } // SAFETY: Match data blocks can be freely sent from one thread to another, // but they do not support multiple threads using them simultaneously. 
// We still implement Sync however, since we require mutable access to use the
// match data block for executing a search, which statically prevents
// simultaneous reading/writing. It is legal to read match data blocks from
// multiple threads simultaneously.
unsafe impl Send for MatchData {}
unsafe impl Sync for MatchData {}

impl Drop for MatchData {
    /// Frees the JIT stack (when one was created), then the match data block,
    /// then the match context.
    fn drop(&mut self) {
        unsafe {
            if let Some(stack) = self.jit_stack {
                pcre2_jit_stack_free_8(stack);
            }
            pcre2_match_data_free_8(self.match_data);
            pcre2_match_context_free_8(self.match_context);
        }
    }
}

impl MatchData {
    /// Create a new match data block from a compiled PCRE2 code object.
    ///
    /// A custom JIT stack is created and assigned to the match context only
    /// when `config.max_jit_stack_size` is set and `code` was JIT compiled.
    ///
    /// This panics if memory could not be allocated for the match context,
    /// the match data block or the JIT stack, or if PCRE2 returns a NULL
    /// ovector pointer.
    pub fn new(config: MatchConfig, code: &Code) -> MatchData {
        let match_context = unsafe {
            pcre2_match_context_create_8(ptr::null_mut())
        };
        assert!(!match_context.is_null(), "failed to allocate match context");

        let match_data = unsafe {
            pcre2_match_data_create_from_pattern_8(
                code.as_ptr(),
                ptr::null_mut(),
            )
        };
        assert!(!match_data.is_null(), "failed to allocate match data block");

        let jit_stack = match config.max_jit_stack_size {
            None => None,
            // A JIT stack is pointless unless the code was JIT compiled.
            Some(_) if !code.compiled_jit => None,
            Some(max) => {
                let stack = unsafe {
                    pcre2_jit_stack_create_8(
                        // `*` binds tighter than `<<`, so this is
                        // `(32 * 1) << 10`, i.e. 32 KiB, capped at `max`.
                        cmp::min(max, 32 * 1<<10),
                        max,
                        ptr::null_mut(),
                    )
                };
                assert!(!stack.is_null(), "failed to allocate JIT stack");

                unsafe {
                    pcre2_jit_stack_assign_8(
                        match_context,
                        None,
                        stack as *mut c_void,
                    )
                };
                Some(stack)
            }
        };

        // The ovector pointer and count are read once here and cached for
        // later use.
        let ovector_ptr = unsafe { pcre2_get_ovector_pointer_8(match_data) };
        assert!(!ovector_ptr.is_null(), "got NULL ovector pointer");
        let ovector_count = unsafe { pcre2_get_ovector_count_8(match_data) };
        MatchData {
            config,
            match_context,
            match_data,
            jit_stack,
            ovector_ptr,
            ovector_count,
        }
    }

    /// Return the configuration for this match data object.
    pub fn config(&self) -> &MatchConfig {
        &self.config
    }

    /// Execute PCRE2's primary match routine on the given subject string
    /// starting at the given offset. The provided options are passed to PCRE2
    /// as is.
    ///
    /// This returns `Ok(false)` if no match occurred, `Ok(true)` if a match
    /// was found, and an error for any other negative return code from PCRE2.
    ///
    /// Match offsets can be extracted via `ovector`.
    ///
    /// # Panics
    ///
    /// Panics if PCRE2 returns `0`, which would mean the ovector was too
    /// small.
    ///
    /// # Safety
    ///
    /// This routine is marked unsafe because it allows the caller to set
    /// arbitrary PCRE2 options. Some of those options can invoke undefined
    /// behavior when not used correctly. For example, if PCRE2_NO_UTF_CHECK
    /// is given and UTF mode is enabled and the given subject string is not
    /// valid UTF-8, then the result is undefined.
    pub unsafe fn find(
        &mut self,
        code: &Code,
        mut subject: &[u8],
        start: usize,
        options: u32,
    ) -> Result<bool, Error> {
        // When the subject is empty, we use an empty slice
        // with a known valid pointer. Otherwise, slices derived
        // from, e.g., an empty `Vec<u8>` may not have a valid
        // pointer, since creating an empty `Vec` is guaranteed
        // to not allocate.
        const EMPTY: &[u8] = &[];
        if subject.is_empty() {
            subject = EMPTY;
        }

        let rc = pcre2_match_8(
            code.as_ptr(),
            subject.as_ptr(),
            subject.len(),
            start,
            options,
            self.as_mut_ptr(),
            self.match_context,
        );
        if rc == PCRE2_ERROR_NOMATCH {
            Ok(false)
        } else if rc > 0 {
            Ok(true)
        } else {
            // We always create match data with
            // pcre2_match_data_create_from_pattern, so the ovector should
            // always be big enough.
            assert!(rc != 0, "ovector should never be too small");
            Err(Error::matching(rc))
        }
    }

    /// Return the raw mutable pointer to the underlying match data.
    fn as_mut_ptr(&mut self) -> *mut pcre2_match_data_8 {
        self.match_data
    }

    /// Return the ovector corresponding to this match data.
    ///
    /// The ovector represents match offsets as pairs. This always returns
    /// N + 1 pairs (so 2*N + 2 offsets), where N is the number of capturing
    /// groups in the original regex.
    pub fn ovector(&self) -> &[usize] {
        // SAFETY: Both our ovector pointer and count are derived directly from
        // the creation of a valid match data block. One interesting question
        // here is whether the contents of the ovector are always initialized.
// The PCRE2 documentation suggests that they are (so does testing), // but this isn't actually 100% clear! unsafe { slice::from_raw_parts( self.ovector_ptr, self.ovector_count as usize * 2, ) } } } ���������������������������������������������������������������������������������������������������������������������������pcre2-0.2.2/src/lib.rs������������������������������������������������������������������������������0100644�0001750�0000144�00000001133�13454714415�0012654�0����������������������������������������������������������������������������������������������������ustar�00����������������������������������������������������������������0000000�0000000������������������������������������������������������������������������������������������������������������������������������������������������������������������������/*! This crate provides a safe high level Rust binding to [PCRE2](https://www.pcre.org/). The API of this crate attempts to correspond closely to the API of Rust's [`regex`](https://docs.rs/regex) crate. The API provided by this crate neither matches the full API of Rust's regex crate nor does it expose the full functionality of PCRE2. Contributions are welcome to improve this. */ #![deny(missing_docs)] pub use crate::error::{Error, ErrorKind}; pub use crate::ffi::{is_jit_available, version}; /** PCRE2 regular expressions for matching on arbitrary bytes. 
*/ pub mod bytes; mod error; mod ffi; �������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������pcre2-0.2.2/.cargo_vcs_info.json��������������������������������������������������������������������0000644�����������������00000000112�00000000000�0012101�0����������������������������������������������������������������������������������������������������ustar�00�������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������{ "git": { "sha1": "1cc3194b8ca5e9fc76dd9d6717923e862df7d136" } } 
����������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������