onig-4.3.2/Cargo.toml.orig010064400007650000024000000015051342760711200135660ustar0000000000000000[package] name = "onig" version = "4.3.2" authors = [ "Will Speak ", "Ivan Ivashchenko " ] description = """ Rust-Onig is a set of Rust bindings for the Oniguruma regular expression library. Oniguruma is a modern regex library with support for multiple character encodings and regex syntaxes. """ repository = "http://github.com/iwillspeak/rust-onig" documentation = "https://docs.rs/onig/" readme = "../README.md" license = "MIT" [features] std-pattern = [] # include regexec() posix-api = ["onig_sys/posix-api"] # Make Oniguruma print debug output for parsing/compiling and executing print-debug = ["onig_sys/print-debug"] [dependencies] bitflags = "1.0" lazy_static = "1.2" [target.'cfg(windows)'.dependencies] libc = "0.2" [dependencies.onig_sys] version = "69.1.0" path = "../onig_sys" onig-4.3.2/Cargo.toml0000644000000023720000000000000100370ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g. crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] name = "onig" version = "4.3.2" authors = ["Will Speak ", "Ivan Ivashchenko "] description = "Rust-Onig is a set of Rust bindings for the\nOniguruma regular expression library. 
Oniguruma\nis a modern regex library with support for\nmultiple character encodings and regex syntaxes.\n" documentation = "https://docs.rs/onig/" readme = "../README.md" license = "MIT" repository = "http://github.com/iwillspeak/rust-onig" [dependencies.bitflags] version = "1.0" [dependencies.lazy_static] version = "1.2" [dependencies.onig_sys] version = "69.1.0" [features] posix-api = ["onig_sys/posix-api"] print-debug = ["onig_sys/print-debug"] std-pattern = [] [target."cfg(windows)".dependencies.libc] version = "0.2" onig-4.3.2/examples/capturedump.rs010064400007650000024000000020051341641542700154140ustar0000000000000000extern crate onig; use onig::*; use std::env; use std::io; use std::io::prelude::*; use std::collections::HashMap; fn main() { let mut regexes = HashMap::new(); for arg in env::args().skip(1) { println!("Compiling '{}'", arg); let regex_compilation = Regex::new(&arg); match regex_compilation { Ok(regex) => { regexes.insert(arg, regex); } Err(error) => { panic!("{:?}", error); } } } let stdin = io::stdin(); for line in stdin.lock().lines() { if let Ok(line) = line { for (name, regex) in regexes.iter() { let res = regex.captures(&line); match res { Some(captures) => for (i, mat) in captures.iter().enumerate() { println!("{} => '{}'", i, mat.unwrap()); }, None => println!("{} => did not match", name), } } } } } onig-4.3.2/examples/dollar.rs010064400007650000024000000041361341641542700143470ustar0000000000000000extern crate onig; use onig::{Captures, Regex, Replacer}; use std::borrow::Cow; /// A string, with `$1` refering to the first capture group. struct Dollarified<'a>(&'a str); /// Capture Reference to Captured String /// /// Tries to convert a refernece to a capture to the captured text. If /// the reference isn't a valid numeric capture group then no text is /// returned. 
fn capture_str<'t>(caps: &'t Captures, cap_ref: &str) -> Option<&'t str> { cap_ref.parse::().ok().and_then(|p| caps.at(p)) } impl<'a> Replacer for Dollarified<'a> { fn reg_replace(&mut self, caps: &Captures) -> Cow { let mut replacement = String::new(); let mut pattern = self.0; while !pattern.is_empty() { if let Some(position) = pattern.find('$') { // push up to the replacement replacement.push_str(&pattern[..position]); pattern = &pattern[position + 1..]; // find the end of the capture reference let ref_end = pattern .find(|c| !char::is_numeric(c)) .unwrap_or(pattern.len()); // push the capture from this capture reference if let Some(cap) = capture_str(caps, &pattern[..ref_end]) { replacement.push_str(cap); pattern = &pattern[ref_end..]; } else { replacement.push('$'); } } else { // no replacements left replacement.push_str(pattern); break; } } replacement.into() } } fn test_with(replacement: &str) { let re = Regex::new(r"(\w+) (\w+)").unwrap(); let hay = "well (hello world) to you!"; println!( "/{}/{}/ -> {}", &hay, &replacement, re.replace(hay, Dollarified(replacement)) ); } fn main() { test_with("$2 $1"); test_with("($2 $1)"); test_with("|$2|$1|"); test_with("|$0|$2$1"); test_with("$$$"); test_with("$$$3"); test_with("$$2$3"); test_with("Literal replacement"); } onig-4.3.2/examples/listcap.rs010064400007650000024000000024111341641542700145230ustar0000000000000000extern crate onig; use onig::*; fn ex(hay: &str, pattern: &str, syntax: &Syntax) { let reg = Regex::with_options(pattern, RegexOptions::REGEX_OPTION_NONE, syntax).unwrap(); println!("number of captures: {}", reg.captures_len()); println!( "number of capture histories: {}", reg.capture_histories_len() ); let mut region = Region::new(); let r = reg.search_with_options( hay, 0, hay.len(), SearchOptions::SEARCH_OPTION_NONE, Some(&mut region), ); if let Some(pos) = r { println!("match at {}", pos); for (i, (start, end)) in region.iter().enumerate() { println!("{}: ({}-{})", i, start, end); } 
region.tree_traverse(|i, (start, end), level| { println!("{}{}: ({}-{})", " ".repeat(level as usize), i, start, end); true }); } else { println!("search fail"); } } fn main() { let mut syn = Syntax::default().clone(); syn.enable_operators(SyntaxOperator::SYNTAX_OPERATOR_ATMARK_CAPTURE_HISTORY); ex( "((())())", "\\g

(?@

\\(\\g\\)){0}(?@(?:\\g

)*|){0}", &syn, ); ex("x00x00x00", "(?@x(?@\\d+))+", &syn); ex("0123", "(?@.)(?@.)(?@.)(?@.)", &syn); } onig-4.3.2/examples/names.rs010064400007650000024000000016271341641542700141770ustar0000000000000000extern crate onig; use onig::*; fn main() { let pattern = "(?a*)(?b*)(?c*)"; let string = "aaabbbbcc"; let r = Regex::new(pattern).unwrap(); println!("has {} group names:", r.capture_names_len()); for (name, indices) in r.capture_names() { println!("- {}: {:?}", name, indices); } let mut region = Region::new(); if let Some(position) = r.search_with_options( string, 0, string.len(), SearchOptions::SEARCH_OPTION_NONE, Some(&mut region), ) { println!("match at {} in {:?}", position, string); r.foreach_name(|name, groups| { for group in groups { let pos = region.pos(*group as usize).unwrap(); println!("- {} ({}): {} - {}", name, group, pos.0, pos.1); } true }); } else { println!("search fail") } } onig-4.3.2/examples/scan.rs010064400007650000024000000011231341641542700140070ustar0000000000000000extern crate onig; use onig::*; fn scan_callback<'t>(n: i32, caps: Captures<'t>) -> bool { println!("scan: {}", n); println!("match at {}", caps.offset()); for (i, cap) in caps.iter_pos().enumerate() { match cap { Some(pos) => println!("{}: {:?}", i, pos), None => println!("{}: did not capture", i), } } true } fn exec(pattern: &str, to_match: &str) { let reg = Regex::new(pattern).unwrap(); reg.scan(to_match, scan_callback); } fn main() { exec("\\Ga+\\s*", "a aa aaa baaa"); exec("a+\\s*", "a aa aaa baaa"); } onig-4.3.2/examples/simple.rs010064400007650000024000000010731341641542700143600ustar0000000000000000extern crate onig; use onig::*; fn main() { let pattern = "a(.*)b|[e-f]+"; let string = "zzzzaffffffffb"; let r = Regex::new(pattern).unwrap(); match r.captures(string) { Some(caps) => { println!("match at {}", caps.offset()); for (i, cap) in caps.iter_pos().enumerate() { match cap { Some(pos) => println!("{}: {:?}", i, pos), None => println!("{}: did not capture", i), } } } 
None => println!("search fail"), } } onig-4.3.2/examples/simple_grep.rs010064400007650000024000000023671341641542700154040ustar0000000000000000extern crate onig; use onig::*; use std::env; use std::io; use std::io::prelude::*; use std::collections::HashMap; fn main() { let mut regexes = HashMap::new(); for arg in env::args().skip(1) { println!("Compiling '{}'", arg); let regex_compilation = Regex::with_options( &arg, onig::RegexOptions::REGEX_OPTION_SINGLELINE, onig::Syntax::emacs(), ); match regex_compilation { Ok(regex) => { regexes.insert(arg, regex); } Err(error) => { panic!("{:?}", error); } } } let stdin = io::stdin(); for line in stdin.lock().lines() { if let Ok(line) = line { for (name, regex) in regexes.iter() { let res = regex.search_with_options( &line, 0, line.len(), onig::SearchOptions::SEARCH_OPTION_NONE, None, ); match res { Some(pos) => println!("{} => matched @ {}", name, pos), None => println!("{} => did not match", name), } } } } println!("done"); } onig-4.3.2/examples/sql.rs010064400007650000024000000026511341641542700136710ustar0000000000000000extern crate onig; extern crate onig_sys; use onig::*; fn main() { let mut syntax = Syntax::default().clone(); syntax.set_operators(SyntaxOperator::SYNTAX_OPERATOR_VARIABLE_META_CHARACTERS); syntax.set_behavior(SyntaxBehavior::empty()); syntax.set_options(RegexOptions::REGEX_OPTION_MULTILINE); syntax.set_meta_char(MetaCharType::META_CHAR_ESCAPE, MetaChar::Character('\\')); syntax.set_meta_char(MetaCharType::META_CHAR_ANYCHAR, MetaChar::Character('_')); syntax.set_meta_char(MetaCharType::META_CHAR_ANYTIME, MetaChar::Ineffective); syntax.set_meta_char( MetaCharType::META_CHAR_ZERO_OR_ONE_TIME, MetaChar::Ineffective, ); syntax.set_meta_char( MetaCharType::META_CHAR_ONE_OR_MORE_TIME, MetaChar::Ineffective, ); syntax.set_meta_char( MetaCharType::META_CHAR_ANYCHAR_ANYTIME, MetaChar::Character('%'), ); let reg = Regex::with_options("\\_%\\\\__zz", RegexOptions::REGEX_OPTION_NONE, &syntax).unwrap(); match 
reg.captures("a_abcabcabc\\ppzz") { Some(caps) => { println!("match at {}", caps.offset()); for (i, cap) in caps.iter_pos().enumerate() { match cap { Some(pos) => println!("{}: {:?}", i, pos), None => println!("{}: did not capture", i), } } } None => println!("search fail"), } } onig-4.3.2/examples/syntax.rs010064400007650000024000000017431341641542700144210ustar0000000000000000extern crate onig; use onig::*; fn exec(syntax: &Syntax, pattern: &str, to_search: &str) { let reg = Regex::with_options(pattern, RegexOptions::REGEX_OPTION_NONE, syntax).unwrap(); match reg.captures(to_search) { Some(caps) => { println!("match at {}", caps.offset()); for (i, cap) in caps.iter_pos().enumerate() { match cap { Some(pos) => println!("{}: {:?}", i, pos), None => println!("{}: did not capture", i), } } } None => println!("search fail"), } } fn main() { exec( Syntax::perl(), r"\p{XDigit}\P{XDigit}\p{^XDigit}\P{^XDigit}\p{XDigit}", "bgh3a", ); exec(Syntax::java(), r"\p{XDigit}\P{XDigit}[a-c&&b-g]", "bgc"); exec( Syntax::asis(), r"abc def* e+ g?ddd[a-rvvv] (vv){3,7}hv\dvv(?:aczui ss)\W\w$", r"abc def* e+ g?ddd[a-rvvv] (vv){3,7}hv\dvv(?:aczui ss)\W\w$", ); } onig-4.3.2/examples/user_property.rs010064400007650000024000000020221341641542700160040ustar0000000000000000extern crate onig; use std::str; use onig::*; fn main() { define_user_property( "HandakuonHiragana", &[ (0x3071, 0x3071), // PA (0x3074, 0x3074), // PI (0x3077, 0x3077), // PU (0x307a, 0x307a), // PE (0x307d, 0x307d), // PO ], ); // "PA PI PU PE PO a" let hay = [ 0xe3, 0x81, 0xb1, 0xe3, 0x81, 0xb4, 0xe3, 0x81, 0xb7, 0xe3, 0x81, 0xba, 0xe3, 0x81, 0xbd, 'a' as u8, ]; let hay = str::from_utf8(&hay).unwrap(); let reg = Regex::new("\\A(\\p{HandakuonHiragana}{5})\\p{^HandakuonHiragana}\\z").unwrap(); match reg.captures(hay) { Some(caps) => { println!("match at {}", caps.offset()); for (i, cap) in caps.iter_pos().enumerate() { match cap { Some(pos) => println!("{}: {:?}", i, pos), None => println!("{}: did not capture", i), } } } 
None => println!("search fail"), } } onig-4.3.2/LICENSE.md010064400007650000024000000026601334471544600123160ustar0000000000000000# Rust-Onig is Open Source! All source code in this repository is distributed under the terms of the *MIT License* unless otherwise stated. The Oniguruma source code remains the property of the original authors and is re-distributed under the original license. > The MIT License (MIT) > > Copyright (c) 2015 Will Speak , Ivan Ivashchenko > , and contributors. > > Permission is hereby granted, free of charge, to any person obtaining a copy > of this software and associated documentation files (the "Software"), to deal > in the Software without restriction, including without limitation the rights > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell > copies of the Software, and to permit persons to whom the Software is > furnished to do so, subject to the following conditions: > > The above copyright notice and this permission notice shall be included in all > copies or substantial portions of the Software. > > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE > SOFTWARE. onig-4.3.2/src/buffers.rs010064400007650000024000000107011341641542700134720ustar0000000000000000//! Encoded Buffers Implementation //! //! This module contains a trait used for converting byte buffers or //! Rust strings into oniguruma char buffers to search and compile //! with. use onig_sys; /// Encoded String Buffer /// /// Represents a buffer of characters with encoding information /// attached. 
pub trait EncodedChars { /// Pointer to the start of the pattern /// /// This should point to the first character in the buffer, /// encoded as an `onig_sys` character. fn start_ptr(&self) -> *const onig_sys::OnigUChar; /// Pointer to the limit of the pattern buffer /// /// This should point just past the final character in the buffer, /// encoded as an `onig_sys` character. fn limit_ptr(&self) -> *const onig_sys::OnigUChar; /// The encoding of the contents of the buffer fn encoding(&self) -> onig_sys::OnigEncoding { unsafe { &onig_sys::OnigEncodingUTF8 } } /// The length of this buffer fn len(&self) -> usize; /// Is the buffer empty? fn is_empty(&self) -> bool { self.len() == 0 } } /// Encoded Charters from a `str` Reference impl EncodedChars for T where T: AsRef, { fn start_ptr(&self) -> *const onig_sys::OnigUChar { self.as_ref().as_bytes().as_ptr() } fn limit_ptr(&self) -> *const onig_sys::OnigUChar { let bytes = self.as_ref().as_bytes(); bytes[bytes.len()..].as_ptr() } fn len(&self) -> usize { self.as_ref().len() } } /// Byte Buffer /// /// Represents a buffer of bytes, with an encoding. 
pub struct EncodedBytes<'a> { bytes: &'a [u8], enc: onig_sys::OnigEncoding, } impl<'a> EncodedBytes<'a> { /// New Buffer from Parts /// /// # Arguments /// /// * `bytes` - The contents of the buffer /// * `enc` - The encoding this buffer is in /// /// # Returns /// /// A new buffer instance pub fn from_parts(bytes: &'a [u8], enc: onig_sys::OnigEncoding) -> EncodedBytes<'a> { EncodedBytes { bytes, enc, } } /// New ASCII Buffer /// /// # Arguments /// /// * `bytes` - The ASCII encoded string /// /// # Returns /// /// A new buffer instance pub fn ascii(bytes: &'a [u8]) -> EncodedBytes<'a> { EncodedBytes { bytes, enc: unsafe { &onig_sys::OnigEncodingASCII }, } } } impl<'a> EncodedChars for EncodedBytes<'a> { fn start_ptr(&self) -> *const onig_sys::OnigUChar { self.bytes.as_ptr() } fn limit_ptr(&self) -> *const onig_sys::OnigUChar { self.bytes[self.bytes.len()..].as_ptr() } fn encoding(&self) -> onig_sys::OnigEncoding { self.enc } fn len(&self) -> usize { self.bytes.len() } } #[cfg(test)] pub mod tests { use onig_sys; use super::*; #[test] pub fn rust_string_encoding_is_utf8() { let foo = "foo"; assert_eq!( unsafe { &onig_sys::OnigEncodingUTF8 } as onig_sys::OnigEncoding, foo.encoding() ); let bar = String::from(".*"); assert_eq!( unsafe { &onig_sys::OnigEncodingUTF8 } as onig_sys::OnigEncoding, bar.encoding() ); } #[test] pub fn rust_bytes_encoding_is_ascii() { let fizz = b"fizz"; let buff = EncodedBytes::ascii(fizz); assert_eq!( unsafe { &onig_sys::OnigEncodingASCII } as onig_sys::OnigEncoding, buff.encoding() ); } #[test] pub fn rust_string_ptr_offsets_are_valid() { let test_string = "hello world"; assert_eq!( test_string.limit_ptr() as usize - test_string.start_ptr() as usize, test_string.len() ); } #[test] pub fn rust_bytes_ptr_offsets_are_valid() { let fozz = b"foo.*bar"; let buff = EncodedBytes::ascii(fozz); assert_eq!( buff.limit_ptr() as usize - buff.start_ptr() as usize, fozz.len() ); } #[test] pub fn byte_buffer_create() { let buff = b"hello world"; let 
enc_buffer = EncodedBytes::from_parts(buff, unsafe { &onig_sys::OnigEncodingASCII }); assert_eq!( unsafe { &onig_sys::OnigEncodingASCII } as onig_sys::OnigEncoding, enc_buffer.encoding() ); assert_eq!( enc_buffer.limit_ptr() as usize - enc_buffer.start_ptr() as usize, buff.len() ); } } onig-4.3.2/src/find.rs010064400007650000024000000431621342757547600130020ustar0000000000000000use std::iter::Iterator; use super::{Regex, Region, SearchOptions}; impl Regex { /// Returns the capture groups corresponding to the leftmost-first match /// in text. Capture group `0` always corresponds to the entire match. /// If no match is found, then `None` is returned. pub fn captures<'t>(&self, text: &'t str) -> Option> { let mut region = Region::new(); self.search_with_options( text, 0, text.len(), SearchOptions::SEARCH_OPTION_NONE, Some(&mut region), ).map(|pos| Captures { text, region, offset: pos, }) } /// Returns an iterator for each successive non-overlapping match in `text`, /// returning the start and end byte indices with respect to `text`. /// /// # Example /// /// Find the start and end location of every word with exactly 13 /// characters: /// /// ```rust /// # extern crate onig; use onig::Regex; /// # fn main() { /// let text = "Retroactively relinquishing remunerations is reprehensible."; /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { /// println!("{:?}", pos); /// } /// // Output: /// // (0, 13) /// // (14, 27) /// // (28, 41) /// // (45, 58) /// # } /// ``` pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { FindMatches { regex: self, region: Region::new(), text, last_end: 0, skip_next_empty: false, } } /// Returns an iterator over all the non-overlapping capture groups matched /// in `text`. This is operationally the same as `find_iter` (except it /// yields information about submatches). 
/// /// # Example /// /// We can use this to find all movie titles and their release years in /// some text, where the movie is formatted like "'Title' (xxxx)": /// /// ```rust /// # extern crate onig; use onig::Regex; /// # fn main() { /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)") /// .unwrap(); /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; /// for caps in re.captures_iter(text) { /// println!("Movie: {:?}, Released: {:?}", caps.at(1), caps.at(2)); /// } /// // Output: /// // Movie: Citizen Kane, Released: 1941 /// // Movie: The Wizard of Oz, Released: 1939 /// // Movie: M, Released: 1931 /// # } /// ``` pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> FindCaptures<'r, 't> { FindCaptures { regex: self, text, last_end: 0, skip_next_empty: false, } } /// Returns an iterator of substrings of `text` delimited by a match /// of the regular expression. /// Namely, each element of the iterator corresponds to text that *isn't* /// matched by the regular expression. /// /// This method will *not* copy the text given. /// /// # Example /// /// To split a string delimited by arbitrary amounts of spaces or tabs: /// /// ```rust /// # extern crate onig; use onig::Regex; /// # fn main() { /// let re = Regex::new(r"[ \t]+").unwrap(); /// let fields: Vec<&str> = re.split("a b \t c\td e").collect(); /// assert_eq!(fields, vec!("a", "b", "c", "d", "e")); /// # } /// ``` pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> { RegexSplits { finder: self.find_iter(text), last: 0, } } /// Returns an iterator of at most `limit` substrings of `text` delimited /// by a match of the regular expression. (A `limit` of `0` will return no /// substrings.) /// Namely, each element of the iterator corresponds to text that *isn't* /// matched by the regular expression. /// The remainder of the string that is not split will be the last element /// in the iterator. /// /// This method will *not* copy the text given. 
/// /// # Example /// /// Get the first two words in some text: /// /// ```rust /// # extern crate onig; use onig::Regex; /// # fn main() { /// let re = Regex::new(r"\W+").unwrap(); /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect(); /// assert_eq!(fields, vec!("Hey", "How", "are you?")); /// # } /// ``` pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) -> RegexSplitsN<'r, 't> { RegexSplitsN { splits: self.split(text), n: limit, } } /// Scan the given slice, capturing into the given region and /// executing a callback for each match. pub fn scan_with_region( &self, to_search: &str, region: &mut Region, options: SearchOptions, mut callback: F, ) -> i32 where F: Fn(i32, i32, &Region) -> bool, { use onig_sys::{onig_scan, OnigRegion}; use std::os::raw::{c_int, c_void}; // Find the bounds of the string we're searching let start = to_search.as_ptr(); let end = to_search[to_search.len()..].as_ptr(); extern "C" fn scan_cb(i: c_int, j: c_int, r: *const OnigRegion, ud: *mut c_void) -> c_int where F: Fn(i32, i32, &Region) -> bool, { let region = Region::clone_from_raw(r); let callback = unsafe { &*(ud as *mut F) }; if callback(i, j, ®ion) { 0 } else { -1 } } unsafe { onig_scan( self.raw, start, end, (&mut region.raw) as *mut ::onig_sys::OnigRegion, options.bits(), scan_cb::, &mut callback as *mut F as *mut c_void, ) } } /// Scan a Pattern and Observe Captures /// /// The scan function takes a haystack `to_search` and invokes the /// given `callback` for each capture of this expression. pub fn scan<'t, CB>(&self, to_search: &'t str, callback: CB) where CB: Fn(i32, Captures<'t>) -> bool, { let mut region = Region::new(); self.scan_with_region( to_search, &mut region, SearchOptions::SEARCH_OPTION_NONE, |n, s, region| { let captures = Captures { text: to_search, region: region.clone(), offset: s as usize, }; callback(n, captures) }, ); } } /// Captures represents a group of captured strings for a single match. 
/// /// The 0th capture always corresponds to the entire match. Each subsequent /// index corresponds to the next capture group in the regex. Positions /// returned from a capture group are always byte indices. /// /// `'t` is the lifetime of the matched text. #[derive(Debug)] pub struct Captures<'t> { text: &'t str, region: Region, offset: usize, } impl<'t> Captures<'t> { /// Returns the start and end positions of the Nth capture group. Returns /// `None` if i is not a valid capture group or if the capture group did /// not match anything. The positions returned are always byte indices with /// respect to the original string matched. pub fn pos(&self, pos: usize) -> Option<(usize, usize)> { self.region.pos(pos) } /// Returns the matched string for the capture group `i`. If `i` isn't /// a valid capture group or didn't match anything, then `None` is returned. pub fn at(&self, pos: usize) -> Option<&'t str> { self.pos(pos).map(|(beg, end)| &self.text[beg..end]) } /// Returns the number of captured groups. pub fn len(&self) -> usize { self.region.len() } /// Returns true if and only if there are no captured groups. pub fn is_empty(&self) -> bool { self.len() == 0 } /// Creates an iterator of all the capture groups in order of appearance in /// the regular expression. pub fn iter(&'t self) -> SubCaptures<'t> { SubCaptures { idx: 0, caps: self } } /// Creates an iterator of all the capture group positions in order of /// appearance in the regular expression. Positions are byte indices in /// terms of the original string matched. pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { SubCapturesPos { idx: 0, caps: self } } /// Offset of the captures within the given string slice. pub fn offset(&self) -> usize { self.offset } } /// An iterator over capture groups for a particular match of a regular /// expression. /// /// `'t` is the lifetime of the matched text. 
pub struct SubCaptures<'t> { idx: usize, caps: &'t Captures<'t>, } impl<'t> Iterator for SubCaptures<'t> { type Item = Option<&'t str>; fn next(&mut self) -> Option> { if self.idx < self.caps.len() { self.idx += 1; Some(self.caps.at(self.idx - 1)) } else { None } } fn size_hint(&self) -> (usize, Option) { let size = self.caps.len(); (size, Some(size)) } } /// An iterator over capture group positions for a particular match of /// a regular expression. /// /// Positions are byte indices in terms of the original /// string matched. `'t` is the lifetime of the matched text. pub struct SubCapturesPos<'t> { idx: usize, caps: &'t Captures<'t>, } impl<'t> Iterator for SubCapturesPos<'t> { type Item = Option<(usize, usize)>; fn next(&mut self) -> Option> { if self.idx < self.caps.len() { self.idx += 1; Some(self.caps.pos(self.idx - 1)) } else { None } } fn size_hint(&self) -> (usize, Option) { let size = self.caps.len(); (size, Some(size)) } } /// An iterator over all non-overlapping matches for a particular string. /// /// The iterator yields a tuple of integers corresponding to the start and end /// of the match. The indices are byte offsets. The iterator stops when no more /// matches can be found. /// /// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime /// of the matched string. pub struct FindMatches<'r, 't> { regex: &'r Regex, region: Region, text: &'t str, last_end: usize, skip_next_empty: bool, } impl<'r, 't> Iterator for FindMatches<'r, 't> { type Item = (usize, usize); fn next(&mut self) -> Option<(usize, usize)> { if self.last_end > self.text.len() { return None; } self.region.clear(); self.regex.search_with_options( self.text, self.last_end, self.text.len(), SearchOptions::SEARCH_OPTION_NONE, Some(&mut self.region), )?; let (s, e) = self.region.pos(0).unwrap(); self.last_end = e; // Don't accept empty matches immediately following a match. // i.e., no infinite loops please. if e == s { self.last_end += self.text[self.last_end..] 
.chars() .next() .map(|c| c.len_utf8()) .unwrap_or(1); if self.skip_next_empty { self.skip_next_empty = false; return self.next(); } } else { self.skip_next_empty = true; } Some((s, e)) } } /// An iterator that yields all non-overlapping capture groups matching a /// particular regular expression. /// /// The iterator stops when no more matches can be found. /// /// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime /// of the matched string. pub struct FindCaptures<'r, 't> { regex: &'r Regex, text: &'t str, last_end: usize, skip_next_empty: bool, } impl<'r, 't> Iterator for FindCaptures<'r, 't> { type Item = Captures<'t>; fn next(&mut self) -> Option> { if self.last_end > self.text.len() { return None; } let mut region = Region::new(); let r = self.regex.search_with_options( self.text, self.last_end, self.text.len(), SearchOptions::SEARCH_OPTION_NONE, Some(&mut region), )?; let (s, e) = region.pos(0).unwrap(); // Don't accept empty matches immediately following a match. // i.e., no infinite loops please. if e == s { self.last_end += self.text[self.last_end..] .chars() .next() .map(|c| c.len_utf8()) .unwrap_or(1); if self.skip_next_empty { self.skip_next_empty = false; return self.next(); } } else { self.last_end = e; self.skip_next_empty = true; } Some(Captures { text: self.text, region, offset: r, }) } } /// Yields all substrings delimited by a regular expression match. /// /// `'r` is the lifetime of the compiled expression and `'t` is the lifetime /// of the string being split. 
pub struct RegexSplits<'r, 't> { finder: FindMatches<'r, 't>, last: usize, } impl<'r, 't> Iterator for RegexSplits<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { let text = self.finder.text; match self.finder.next() { None => { if self.last >= text.len() { None } else { let s = &text[self.last..]; self.last = text.len(); Some(s) } } Some((s, e)) => { let matched = &text[self.last..s]; self.last = e; Some(matched) } } } } /// Yields at most `N` substrings delimited by a regular expression match. /// /// The last substring will be whatever remains after splitting. /// /// `'r` is the lifetime of the compiled expression and `'t` is the lifetime /// of the string being split. pub struct RegexSplitsN<'r, 't> { splits: RegexSplits<'r, 't>, n: usize, } impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { type Item = &'t str; fn next(&mut self) -> Option<&'t str> { if self.n == 0 { return None; } self.n -= 1; if self.n == 0 { let text = self.splits.finder.text; Some(&text[self.splits.last..]) } else { self.splits.next() } } fn size_hint(&self) -> (usize, Option) { (0, Some(self.n)) } } #[cfg(test)] mod tests { use super::super::*; #[test] fn test_regex_captures() { let regex = Regex::new("e(l+)|(r+)").unwrap(); let captures = regex.captures("hello").unwrap(); assert_eq!(captures.len(), 3); assert_eq!(captures.is_empty(), false); let pos1 = captures.pos(0).unwrap(); let pos2 = captures.pos(1).unwrap(); let pos3 = captures.pos(2); assert_eq!(pos1, (1, 4)); assert_eq!(pos2, (2, 4)); assert_eq!(pos3, None); let str1 = captures.at(0).unwrap(); let str2 = captures.at(1).unwrap(); let str3 = captures.at(2); assert_eq!(str1, "ell"); assert_eq!(str2, "ll"); assert_eq!(str3, None); } #[test] fn test_regex_subcaptures() { let regex = Regex::new("e(l+)").unwrap(); let captures = regex.captures("hello").unwrap(); let caps = captures.iter().collect::>(); assert_eq!(caps[0], Some("ell")); assert_eq!(caps[1], Some("ll")); assert_eq!(caps.len(), 2); } #[test] fn 
test_regex_subcapturespos() { let regex = Regex::new("e(l+)").unwrap(); let captures = regex.captures("hello").unwrap(); let caps = captures.iter_pos().collect::>(); assert_eq!(caps[0], Some((1, 4))); assert_eq!(caps[1], Some((2, 4))); assert_eq!(caps.len(), 2); } #[test] fn test_find_iter() { let re = Regex::new(r"\d+").unwrap(); let ms = re.find_iter("a12b2").collect::>(); assert_eq!(ms, vec![(1, 3), (4, 5)]); } #[test] fn test_find_iter_one_zero_length() { let re = Regex::new(r"\d*").unwrap(); let ms = re.find_iter("a1b2").collect::>(); assert_eq!(ms, vec![(0, 0), (1, 2), (3, 4)]); } #[test] fn test_find_iter_many_zero_length() { let re = Regex::new(r"\d*").unwrap(); let ms = re.find_iter("a1bbb2").collect::>(); assert_eq!(ms, vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)]); } #[test] fn test_zero_length_matches_jumps_past_match_location() { let re = Regex::new(r"\b").unwrap(); let matches = re.find_iter("test string").collect::>(); assert_eq!(matches, [(0, 0), (4, 4), (5, 5), (11, 11)]); } #[test] fn test_captures_iter() { let re = Regex::new(r"\d+").unwrap(); let ms = re.captures_iter("a12b2").collect::>(); assert_eq!(ms[0].pos(0).unwrap(), (1, 3)); assert_eq!(ms[1].pos(0).unwrap(), (4, 5)); } #[test] fn test_captures_stores_match_offset() { let reg = Regex::new(r"\d+\.(\d+)").unwrap(); let captures = reg.captures("100 - 3.1415 / 2.0").unwrap(); assert_eq!(6, captures.offset()); let all_caps = reg.captures_iter("1 - 3234.3 * 123.2 - 100") .map(|cap| cap.offset()) .collect::>(); assert_eq!(vec![4, 13], all_caps); } } onig-4.3.2/src/flags.rs010064400007650000024000000302331341641542700131340ustar0000000000000000use onig_sys; use std::os::raw as libc; bitflags! { /// Regex parsing and compilation options. pub struct RegexOptions: onig_sys::OnigOptionType { /// Default options. const REGEX_OPTION_NONE = onig_sys::ONIG_OPTION_NONE; /// Ambiguity match on. const REGEX_OPTION_IGNORECASE = onig_sys::ONIG_OPTION_IGNORECASE; /// Extended pattern form. 
const REGEX_OPTION_EXTEND = onig_sys::ONIG_OPTION_EXTEND;
        /// `'.'` also matches a newline.
        const REGEX_OPTION_MULTILINE = onig_sys::ONIG_OPTION_MULTILINE;
        /// `'^'` -> `'\A'`, `'$'` -> `'\Z'`.
        const REGEX_OPTION_SINGLELINE = onig_sys::ONIG_OPTION_SINGLELINE;
        /// Find the longest match.
        const REGEX_OPTION_FIND_LONGEST = onig_sys::ONIG_OPTION_FIND_LONGEST;
        /// Ignore empty matches.
        const REGEX_OPTION_FIND_NOT_EMPTY = onig_sys::ONIG_OPTION_FIND_NOT_EMPTY;
        /// Clear `OPTION_SINGLELINE`, which is enabled by default on
        /// `SYNTAX_POSIX_BASIC`, `SYNTAX_POSIX_EXTENDED`,
        /// `SYNTAX_PERL`, `SYNTAX_PERL_NG`, `SYNTAX_JAVA`.
        const REGEX_OPTION_NEGATE_SINGLELINE = onig_sys::ONIG_OPTION_NEGATE_SINGLELINE;
        /// Only named groups are captured.
        const REGEX_OPTION_DONT_CAPTURE_GROUP = onig_sys::ONIG_OPTION_DONT_CAPTURE_GROUP;
        /// Both named and unnamed groups are captured.
        const REGEX_OPTION_CAPTURE_GROUP = onig_sys::ONIG_OPTION_CAPTURE_GROUP;
    }
}

bitflags! {
    /// Regex evaluation options.
    ///
    /// These are passed to the match and search calls rather than at
    /// compile time.
    pub struct SearchOptions: onig_sys::OnigOptionType {
        /// Default options.
        const SEARCH_OPTION_NONE = onig_sys::ONIG_OPTION_NONE;
        /// The head of the string isn't considered the beginning of a line.
        const SEARCH_OPTION_NOTBOL = onig_sys::ONIG_OPTION_NOTBOL;
        /// The end of the string isn't considered the end of a line.
        const SEARCH_OPTION_NOTEOL = onig_sys::ONIG_OPTION_NOTEOL;
    }
}

bitflags! {
    /// Defines the different operators allowed within a regex syntax.
pub struct SyntaxOperator: u64 {
        /// `.`
        const SYNTAX_OPERATOR_DOT_ANYCHAR = (onig_sys::ONIG_SYN_OP_DOT_ANYCHAR as u64);
        /// `*`
        const SYNTAX_OPERATOR_ASTERISK_ZERO_INF = (onig_sys::ONIG_SYN_OP_ASTERISK_ZERO_INF as u64);
        /// `+`
        const SYNTAX_OPERATOR_PLUS_ONE_INF = (onig_sys::ONIG_SYN_OP_PLUS_ONE_INF as u64);
        /// `?`
        const SYNTAX_OPERATOR_QMARK_ZERO_ONE = (onig_sys::ONIG_SYN_OP_QMARK_ZERO_ONE as u64);
        /// `{lower,upper}`
        const SYNTAX_OPERATOR_BRACE_INTERVAL = (onig_sys::ONIG_SYN_OP_BRACE_INTERVAL as u64);
        /// `\{lower,upper\}`
        const SYNTAX_OPERATOR_ESC_BRACE_INTERVAL = (onig_sys::ONIG_SYN_OP_ESC_BRACE_INTERVAL as u64);
        /// `|`
        const SYNTAX_OPERATOR_VBAR_ALT = (onig_sys::ONIG_SYN_OP_VBAR_ALT as u64);
        /// `\|`
        const SYNTAX_OPERATOR_ESC_VBAR_ALT = (onig_sys::ONIG_SYN_OP_ESC_VBAR_ALT as u64);
        /// `(...)`
        const SYNTAX_OPERATOR_LPAREN_SUBEXP = (onig_sys::ONIG_SYN_OP_LPAREN_SUBEXP as u64);
        /// `\(...\)`
        const SYNTAX_OPERATOR_ESC_LPAREN_SUBEXP = (onig_sys::ONIG_SYN_OP_ESC_LPAREN_SUBEXP as u64);
        /// `\A, \Z, \z`
        const SYNTAX_OPERATOR_ESC_AZ_BUF_ANCHOR = (onig_sys::ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR as u64);
        /// `\G`
        const SYNTAX_OPERATOR_ESC_CAPITAL_G_BEGIN_ANCHOR = (onig_sys::ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR as u64);
        /// `\num`
        const SYNTAX_OPERATOR_DECIMAL_BACKREF = (onig_sys::ONIG_SYN_OP_DECIMAL_BACKREF as u64);
        /// `[...]`
        const SYNTAX_OPERATOR_BRACKET_CC = (onig_sys::ONIG_SYN_OP_BRACKET_CC as u64);
        /// `\w, \W`
        const SYNTAX_OPERATOR_ESC_W_WORD = (onig_sys::ONIG_SYN_OP_ESC_W_WORD as u64);
        /// `\<, \>`
        const SYNTAX_OPERATOR_ESC_LTGT_WORD_BEGIN_END = (onig_sys::ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END as u64);
        /// `\b, \B`
        const SYNTAX_OPERATOR_ESC_B_WORD_BOUND = (onig_sys::ONIG_SYN_OP_ESC_B_WORD_BOUND as u64);
        /// `\s, \S`
        const SYNTAX_OPERATOR_ESC_S_WHITE_SPACE = (onig_sys::ONIG_SYN_OP_ESC_S_WHITE_SPACE as u64);
        /// `\d, \D`
        const SYNTAX_OPERATOR_ESC_D_DIGIT = (onig_sys::ONIG_SYN_OP_ESC_D_DIGIT as u64);
        /// `^, $`
        const SYNTAX_OPERATOR_LINE_ANCHOR = (onig_sys::ONIG_SYN_OP_LINE_ANCHOR as u64);
        /// `[:xxxx:]`
        const SYNTAX_OPERATOR_POSIX_BRACKET = (onig_sys::ONIG_SYN_OP_POSIX_BRACKET as u64);
        /// `??,*?,+?,{n,m}?`
        const SYNTAX_OPERATOR_QMARK_NON_GREEDY = (onig_sys::ONIG_SYN_OP_QMARK_NON_GREEDY as u64);
        /// `\n,\r,\t,\a ...`
        const SYNTAX_OPERATOR_ESC_CONTROL_CHARS = (onig_sys::ONIG_SYN_OP_ESC_CONTROL_CHARS as u64);
        /// `\cx`
        const SYNTAX_OPERATOR_ESC_C_CONTROL = (onig_sys::ONIG_SYN_OP_ESC_C_CONTROL as u64);
        /// `\OOO`
        const SYNTAX_OPERATOR_ESC_OCTAL3 = (onig_sys::ONIG_SYN_OP_ESC_OCTAL3 as u64);
        /// `\xHH`
        const SYNTAX_OPERATOR_ESC_X_HEX2 = (onig_sys::ONIG_SYN_OP_ESC_X_HEX2 as u64);
        /// `\x{7HHHHHHH}`
        const SYNTAX_OPERATOR_ESC_X_BRACE_HEX8 = (onig_sys::ONIG_SYN_OP_ESC_X_BRACE_HEX8 as u64);
        /// Variable meta characters
        const SYNTAX_OPERATOR_VARIABLE_META_CHARACTERS = (onig_sys::ONIG_SYN_OP_VARIABLE_META_CHARACTERS as u64);

        // The flags below come from the `ONIG_SYN_OP2_*` constants and are
        // shifted into the upper 32 bits, so that they share this single
        // `u64` with the `ONIG_SYN_OP_*` flags above without colliding.

        /// `\Q...\E`
        const SYNTAX_OPERATOR_ESC_CAPITAL_Q_QUOTE = (onig_sys::ONIG_SYN_OP2_ESC_CAPITAL_Q_QUOTE as u64) << 32;
        /// `(?...)`
        const SYNTAX_OPERATOR_QMARK_GROUP_EFFECT = (onig_sys::ONIG_SYN_OP2_QMARK_GROUP_EFFECT as u64) << 32;
        /// `(?imsx),(?-imsx)`
        const SYNTAX_OPERATOR_OPTION_PERL = (onig_sys::ONIG_SYN_OP2_OPTION_PERL as u64) << 32;
        /// `(?imx), (?-imx)`
        const SYNTAX_OPERATOR_OPTION_RUBY = (onig_sys::ONIG_SYN_OP2_OPTION_RUBY as u64) << 32;
        /// `?+,*+,++`
        const SYNTAX_OPERATOR_PLUS_POSSESSIVE_REPEAT = (onig_sys::ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT as u64) << 32;
        /// `{n,m}+`
        const SYNTAX_OPERATOR_PLUS_POSSESSIVE_INTERVAL = (onig_sys::ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL as u64) << 32;
        /// `[...&&..[..]..]`
        const SYNTAX_OPERATOR_CCLASS_SET_OP = (onig_sys::ONIG_SYN_OP2_CCLASS_SET_OP as u64) << 32;
        /// `(?<name>...)`
        const SYNTAX_OPERATOR_QMARK_LT_NAMED_GROUP = (onig_sys::ONIG_SYN_OP2_QMARK_LT_NAMED_GROUP as u64) << 32;
        /// `\k<name>`
        const SYNTAX_OPERATOR_ESC_K_NAMED_BACKREF = (onig_sys::ONIG_SYN_OP2_ESC_K_NAMED_BACKREF as u64) << 32;
        /// `\g<name>, \g<n>`
        const SYNTAX_OPERATOR_ESC_G_SUBEXP_CALL = (onig_sys::ONIG_SYN_OP2_ESC_G_SUBEXP_CALL as u64) << 32;
        /// `(?@..),(?@<x>..)`
        const SYNTAX_OPERATOR_ATMARK_CAPTURE_HISTORY = (onig_sys::ONIG_SYN_OP2_ATMARK_CAPTURE_HISTORY as u64) << 32;
        /// `\C-x`
        const SYNTAX_OPERATOR_ESC_CAPITAL_C_BAR_CONTROL = (onig_sys::ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL as u64) << 32;
        /// `\M-x`
        const SYNTAX_OPERATOR_ESC_CAPITAL_M_BAR_META = (onig_sys::ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META as u64) << 32;
        /// `\v as VTAB`
        const SYNTAX_OPERATOR_ESC_V_VTAB = (onig_sys::ONIG_SYN_OP2_ESC_V_VTAB as u64) << 32;
        /// `\uHHHH`
        const SYNTAX_OPERATOR_ESC_U_HEX4 = (onig_sys::ONIG_SYN_OP2_ESC_U_HEX4 as u64) << 32;
        /// `` \`, \' ``
        const SYNTAX_OPERATOR_ESC_GNU_BUF_ANCHOR = (onig_sys::ONIG_SYN_OP2_ESC_GNU_BUF_ANCHOR as u64) << 32;
        /// `\p{...}, \P{...}`
        const SYNTAX_OPERATOR_ESC_P_BRACE_CHAR_PROPERTY = (onig_sys::ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY as u64) << 32;
        /// `\p{^..}, \P{^..}`
        const SYNTAX_OPERATOR_ESC_P_BRACE_CIRCUMFLEX_NOT = (onig_sys::ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT as u64) << 32;
        /// `\h, \H`
        const SYNTAX_OPERATOR_ESC_H_XDIGIT = (onig_sys::ONIG_SYN_OP2_ESC_H_XDIGIT as u64) << 32;
        /// `\`
        const SYNTAX_OPERATOR_INEFFECTIVE_ESCAPE = (onig_sys::ONIG_SYN_OP2_INEFFECTIVE_ESCAPE as u64) << 32;
    }
}

bitflags! {
    /// Defines the behaviour of regex operators.
pub struct SyntaxBehavior: onig_sys::OnigSyntaxBehavior {
        /// `?, *, +, {n,m}`
        const SYNTAX_BEHAVIOR_CONTEXT_INDEP_REPEAT_OPS = onig_sys::ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS;
        /// `error or ignore`
        const SYNTAX_BEHAVIOR_CONTEXT_INVALID_REPEAT_OPS = onig_sys::ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS;
        /// `...)...`
        const SYNTAX_BEHAVIOR_ALLOW_UNMATCHED_CLOSE_SUBEXP = onig_sys::ONIG_SYN_ALLOW_UNMATCHED_CLOSE_SUBEXP;
        /// `{???`
        const SYNTAX_BEHAVIOR_ALLOW_INVALID_INTERVAL = onig_sys::ONIG_SYN_ALLOW_INVALID_INTERVAL;
        /// `{,n} => {0,n}`
        const SYNTAX_BEHAVIOR_ALLOW_INTERVAL_LOW_ABBREV = onig_sys::ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV;
        /// `/(\1)/,/\1()/ ..`
        const SYNTAX_BEHAVIOR_STRICT_CHECK_BACKREF = onig_sys::ONIG_SYN_STRICT_CHECK_BACKREF;
        /// `(?<=a|bc)`
        const SYNTAX_BEHAVIOR_DIFFERENT_LEN_ALT_LOOK_BEHIND = onig_sys::ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND;
        /// See the Oniguruma documentation.
        const SYNTAX_BEHAVIOR_CAPTURE_ONLY_NAMED_GROUP = onig_sys::ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP;
        /// `(?<x>)(?<x>)`
        const SYNTAX_BEHAVIOR_ALLOW_MULTIPLEX_DEFINITION_NAME = onig_sys::ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME;
        /// `a{n}?=(?:a{n})?`
        const SYNTAX_BEHAVIOR_FIXED_INTERVAL_IS_GREEDY_ONLY = onig_sys::ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY;
        /// `[^...]`
        const SYNTAX_BEHAVIOR_NOT_NEWLINE_IN_NEGATIVE_CC = onig_sys::ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC;
        /// `[..\w..] etc..`
        const SYNTAX_BEHAVIOR_BACKSLASH_ESCAPE_IN_CC = onig_sys::ONIG_SYN_BACKSLASH_ESCAPE_IN_CC;
        /// `[0-9-a]=[0-9\-a]`
        const SYNTAX_BEHAVIOR_ALLOW_DOUBLE_RANGE_OP_IN_CC = onig_sys::ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC;
        /// `[,-,]`
        const SYNTAX_BEHAVIOR_WARN_CC_OP_NOT_ESCAPED = onig_sys::ONIG_SYN_WARN_CC_OP_NOT_ESCAPED;
        /// `(?:a*)+`
        const SYNTAX_BEHAVIOR_WARN_REDUNDANT_NESTED_REPEAT = onig_sys::ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT;
    }
}

bitflags!
{ /// The order in which traverse callbacks are invoked pub struct TraverseCallbackAt: libc::c_int { /// Callback before children are wallked const CALLBACK_AT_FIRST = onig_sys::ONIG_TRAVERSE_CALLBACK_AT_FIRST; /// Callback after children are walked const CALLBACK_AT_LAST = onig_sys::ONIG_TRAVERSE_CALLBACK_AT_LAST; /// Callback both before and after children are walked. const CALLBACK_AT_BOTH = onig_sys::ONIG_TRAVERSE_CALLBACK_AT_BOTH; } } bitflags! { /// Syntax meta character types pub struct MetaCharType: libc::c_uint { /// The escape charater for this syntax const META_CHAR_ESCAPE = onig_sys::ONIG_META_CHAR_ESCAPE; /// The any (.) character for this syntax. const META_CHAR_ANYCHAR = onig_sys::ONIG_META_CHAR_ANYCHAR; /// The any number of repeats (*) character for this syntax. const META_CHAR_ANYTIME = onig_sys::ONIG_META_CHAR_ANYTIME; /// The optinoal (?) chracter for this syntax const META_CHAR_ZERO_OR_ONE_TIME = onig_sys::ONIG_META_CHAR_ZERO_OR_ONE_TIME; /// The at least once (+) character for this syntax const META_CHAR_ONE_OR_MORE_TIME = onig_sys::ONIG_META_CHAR_ONE_OR_MORE_TIME; /// The glob character for this syntax (.*) const META_CHAR_ANYCHAR_ANYTIME = onig_sys::ONIG_META_CHAR_ANYCHAR_ANYTIME; } } onig-4.3.2/src/lib.rs010064400007650000024000000754011342757626100126220ustar0000000000000000//! This crate provides a safe wrapper around the //! [Oniguruma](https://github.com/kkos/oniguruma) regular expression library. //! //! # Examples //! //! ```rust //! use onig::Regex; //! //! let regex = Regex::new("e(l+)").unwrap(); //! for (i, pos) in regex.captures("hello").unwrap().iter_pos().enumerate() { //! match pos { //! Some((beg, end)) => //! println!("Group {} captured in position {}:{}", i, beg, end), //! None => //! println!("Group {} is not captured", i) //! } //! } //! ``` //! //! # Match vs Search //! //! There are two basic things you can do with a `Regex` pattern; test //! if the pattern matches the whole of a given string, and search for //! 
occurrences of the pattern within a string. Oniguruma exposes these //! two concepts with the *match* and *search* APIs. //! //! In addition to these two base Oniguruma APIs this crate exposes a //! third *find* API, built on top of the *search* API. //! //! ``` //! # use onig::Regex; //! let pattern = Regex::new("hello").unwrap(); //! assert_eq!(true, pattern.find("hello world").is_some()); //! assert_eq!(false, pattern.is_match("hello world")); //! ``` //! //! ## The *Match* API //! //! Functions in the match API check if a pattern matches the entire //! string. The simplest of these is `Regex::is_match`. This returns //! `true` if the pattern matches the string. For more complex usage, //! `Regex::match_with_options` and `Regex::match_with_encoding` //! can be used. These allow the capture groups to be inspected, //! matching with different options, and matching sub-sections of a //! given text. //! //! ## The *Search* API //! //! Functions in the search API search for a pattern anywhere within a //! string. The simplest of these is `Regex::find`. This returns the //! start and end offsets of the first occurrence of the pattern within the string. //! For more complex usage, `Regex::search_with_options` and //! `Regex::search_with_encoding` can be used. These allow capture //! groups to be inspected, searching with different options and //! searching within subsections of a given text. //! //! ## The *Find* API //! //! The find API is built on top of the search API. Functions in this //! API allow iteration across all matches of the pattern within a //! string, not just the first one. The functions deal with some of //! the complexities of this, such as zero-length matches. //! //! The simplest step-up from the basic search API `Regex::find` is //! getting the captures relating to a match with the //! `Regex::captures` method. To find capture information for all //! matches within a string `Regex::find_iter` and //! `Regex::captures_iter` can be used.
//! The former exposes the start
//! and end of the match as `Regex::find` does, the latter exposes the
//! whole capture group information as `Regex::captures` does.
//!
//! # The `std::pattern` API
//!
//! In addition to the main Oniguruma API it is possible to use the
//! `Regex` object with the
//! [`std::pattern`](https://doc.rust-lang.org/std/str/pattern/)
//! API. To enable support, compile with the `std-pattern` feature. If
//! you're using Cargo you can do this by adding the following to your
//! Cargo.toml:
//!
//! ```toml
//! [dependencies.onig]
//! version = "4.3"
//! features = ["std-pattern"]
//! ```

#![cfg_attr(not(feature = "cargo-clippy"), allow(unknown_lints))]
#![cfg_attr(feature = "std-pattern", feature(pattern))]
#![deny(missing_docs)]

#[macro_use]
extern crate bitflags;
#[macro_use]
extern crate lazy_static;
extern crate onig_sys;
#[cfg(windows)]
extern crate libc;

mod find;
mod flags;
mod region;
mod replace;
mod match_param;
mod names;
mod syntax;
mod tree;
mod utils;
mod buffers;

#[cfg(feature = "std-pattern")]
mod pattern;

// Re-export the onig types publicly.
pub use flags::*;
pub use match_param::MatchParam;
pub use names::CaptureNames;
pub use region::Region;
pub use find::{Captures, FindCaptures, FindMatches, RegexSplits, RegexSplitsN, SubCaptures, SubCapturesPos};
pub use buffers::{EncodedBytes, EncodedChars};
pub use replace::Replacer;
pub use tree::{CaptureTreeNode, CaptureTreeNodeIter};
pub use syntax::{MetaChar, Syntax};
pub use utils::{copyright, define_user_property, version};

use std::{error, fmt, str};
use std::sync::Mutex;
use std::ptr::{null, null_mut};
use std::os::raw::c_int;

/// This structure represents an error from the underlying Oniguruma library.
pub struct Error {
    // Status code reported by the Oniguruma engine.
    code: c_int,
    // Human-readable message that the engine produced for `code`.
    description: String,
}

/// This struct is a wrapper around an Oniguruma regular expression
/// pointer. This represents a compiled regex which can be used in
/// search and match operations.
#[derive(Debug, Eq, PartialEq)]
pub struct Regex {
    // Raw handle to the compiled regex, filled in by `onig_new`.
    raw: onig_sys::OnigRegexMut,
}

// NOTE(review): these impls assert that the raw regex handle may be moved to
// and shared between threads. That is a property of Oniguruma which cannot
// be confirmed from this file.
unsafe impl Send for Regex {}
unsafe impl Sync for Regex {}

impl Error {
    /// Builds an error from a status code plus the extra error info that
    /// was filled in by the failing call.
    fn from_code_and_info(code: c_int, info: &onig_sys::OnigErrorInfo) -> Error {
        Error::new(code, info)
    }

    /// Builds an error from a bare status code, with no extra error info.
    fn from_code(code: c_int) -> Error {
        Error::new(code, null())
    }

    /// Asks the engine to render `code` (and `info`, which may be null) as a
    /// message and stores the result.
    ///
    /// The message bytes come from the engine and are not guaranteed to be
    /// UTF-8, so invalid sequences are replaced rather than causing a panic
    /// while an error is being reported.
    fn new(code: c_int, info: *const onig_sys::OnigErrorInfo) -> Error {
        let buff = &mut [0; onig_sys::ONIG_MAX_ERROR_MESSAGE_LEN as usize];
        let written = unsafe { onig_sys::onig_error_code_to_str(buff.as_mut_ptr(), code, info) };
        // Never trust the reported length beyond what the buffer can hold.
        let len = (written.max(0) as usize).min(buff.len());
        let bytes = &buff[..len];
        let description = match str::from_utf8(bytes) {
            Ok(text) => text.to_owned(),
            Err(_) => String::from_utf8_lossy(bytes).into_owned(),
        };
        Error { code, description }
    }

    /// Return Oniguruma engine error code.
    pub fn code(&self) -> i32 {
        self.code
    }

    /// Return error description provided by Oniguruma engine.
    pub fn description(&self) -> &str {
        &self.description
    }
}

impl error::Error for Error {
    fn description(&self) -> &str {
        &self.description
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Oniguruma error: {}", self.description())
    }
}

impl fmt::Debug for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Error({}, {})", self.code, self.description())
    }
}

lazy_static! {
    // Held around `onig_new` so that only one thread compiles at a time.
    static ref REGEX_NEW_MUTEX: Mutex<()> = Mutex::new(());
}

impl Regex {
    /// Create a Regex
    ///
    /// Simple regular expression constructor. Compiles a new regular
    /// expression with the default options using the ruby syntax.
    /// Once compiled, it can be used repeatedly to search in a string. If an
    /// invalid expression is given, then an error is returned.
    ///
    /// # Arguments
    ///
    /// * `pattern` - The regex pattern to compile
    ///
    /// # Examples
    ///
    /// ```
    /// use onig::Regex;
    /// let r = Regex::new(r#"hello (\w+)"#);
    /// assert!(r.is_ok());
    /// ```
    pub fn new(pattern: &str) -> Result<Regex, Error> {
        Regex::with_encoding(pattern)
    }

    /// Create a Regex, Specifying an Encoding
    ///
    /// Attempts to compile `pattern` into a new `Regex`
    /// instance.
Instead of assuming UTF-8 as the encoding scheme the /// encoding is inferred from the `pattern` buffer. /// /// # Arguments /// /// * `pattern` - The regex pattern to compile /// /// # Examples /// /// ``` /// use onig::{Regex, EncodedBytes}; /// let utf8 = Regex::with_encoding("hello"); /// assert!(utf8.is_ok()); /// let ascii = Regex::with_encoding(EncodedBytes::ascii(b"world")); /// assert!(ascii.is_ok()); /// ``` pub fn with_encoding(pattern: T) -> Result where T: EncodedChars, { Regex::with_options_and_encoding( pattern, RegexOptions::REGEX_OPTION_NONE, Syntax::default(), ) } /// Create a new Regex /// /// Attempts to compile a pattern into a new `Regex` instance. /// Once compiled, it can be used repeatedly to search in a string. If an /// invalid expression is given, then an error is returned. /// See [`onig_sys::onig_new`][regex_new] for more information. /// /// # Arguments /// /// * `pattern` - The regex pattern to compile. /// * `options` - The regex compilation options. /// * `syntax` - The syntax which the regex is written in. /// /// # Examples /// /// ``` /// use onig::{Regex, Syntax, RegexOptions}; /// let r = Regex::with_options("hello.*world", /// RegexOptions::REGEX_OPTION_NONE, /// Syntax::default()); /// assert!(r.is_ok()); /// ``` /// /// [regex_new]: ./onig_sys/fn.onig_new.html pub fn with_options( pattern: &str, option: RegexOptions, syntax: &Syntax, ) -> Result { Regex::with_options_and_encoding(pattern, option, syntax) } /// Create a new Regex, Specifying Options and Ecoding /// /// Attempts to comile the given `pattern` into a new `Regex` /// instance. Instead of assuming UTF-8 as the encoding scheme the /// encoding is inferred from the `pattern` buffer. If the regex /// fails to compile the returned `Error` value from /// [`onig_new`][regex_new] contains more information. /// /// [regex_new]: ./onig_sys/fn.onig_new.html /// /// # Arguments /// /// * `pattern` - The regex pattern to compile. 
/// * `option` - The regex compilation options.
/// * `syntax` - The syntax which the regex is written in.
///
/// # Examples
/// ```
/// use onig::{Regex, Syntax, EncodedBytes, RegexOptions};
/// let pattern = EncodedBytes::ascii(b"hello");
/// let r = Regex::with_options_and_encoding(pattern,
///                                          RegexOptions::REGEX_OPTION_SINGLELINE,
///                                          Syntax::default());
/// assert!(r.is_ok());
/// ```
pub fn with_options_and_encoding<T>(
    pattern: T,
    option: RegexOptions,
    syntax: &Syntax,
) -> Result<Regex, Error>
where
    T: EncodedChars,
{
    // Out-parameter that `onig_new` fills with the compiled regex handle.
    let mut reg: onig_sys::OnigRegexMut = null_mut();
    let reg_ptr = &mut reg as *mut onig_sys::OnigRegexMut;

    // Filled in by `onig_new` on failure; used to build the error message.
    let mut error = onig_sys::OnigErrorInfo {
        enc: null(),
        par: null(),
        par_end: null(),
    };

    let err = {
        // Make sure that `onig_new` isn't called by more than one thread at
        // a time. The mutex protects no data, so a poisoned lock is still
        // perfectly usable; recovering it keeps one earlier panic from
        // making every later compilation panic as well.
        let _guard = REGEX_NEW_MUTEX
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());

        // SAFETY: `reg_ptr` and `error` point at live locals. The pattern
        // pointers are assumed to delimit a valid buffer, as promised by the
        // `EncodedChars` implementation.
        unsafe {
            onig_sys::onig_new(
                reg_ptr,
                pattern.start_ptr(),
                pattern.limit_ptr(),
                option.bits(),
                pattern.encoding(),
                syntax as *const Syntax as *const onig_sys::OnigSyntaxType,
                &mut error,
            )
        }
    };

    if err == onig_sys::ONIG_NORMAL {
        Ok(Regex { raw: reg })
    } else {
        Err(Error::from_code_and_info(err, &error))
    }
}

/// Match String
///
/// Try to match the regex against the given string slice,
/// starting at a given offset. This method works the same way as
/// `match_with_encoding`, but the encoding is always utf-8.
///
/// For more information see [Match vs
/// Search](index.html#match-vs-search)
///
/// # Arguments
///
/// * `str` - The string slice to match against.
/// * `at` - The byte index in the passed slice to start matching
/// * `options` - The regex match options.
/// * `region` - The region for return group match range info
///
/// # Returns
///
/// `Some(len)` if the regex matched, with `len` being the number
/// of bytes matched.
`None` if the regex doesn't match. /// /// # Examples /// /// ``` /// use onig::{Regex, SearchOptions}; /// /// let r = Regex::new(".*").unwrap(); /// let res = r.match_with_options("hello", 0, SearchOptions::SEARCH_OPTION_NONE, None); /// assert!(res.is_some()); // it matches /// assert!(res.unwrap() == 5); // 5 characters matched /// ``` pub fn match_with_options( &self, str: &str, at: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option { self.match_with_encoding(str, at, options, region) } /// Match String with Encoding /// /// Match the regex against a string. This method will start at /// the offset `at` into the string and try and match the /// regex. If the regex matches then the return value is the /// number of characters which matched. If the regex doesn't match /// the return is `None`. /// /// For more information see [Match vs /// Search](index.html#match-vs-search) /// /// The contents of `chars` must have the same encoding that was /// used to construct the regex. /// /// # Arguments /// /// * `chars` - The buffer to match against. /// * `at` - The byte index in the passed buffer to start matching /// * `options` - The regex match options. /// * `region` - The region for return group match range info /// /// # Returns /// /// `Some(len)` if the regex matched, with `len` being the number /// of bytes matched. `None` if the regex doesn't match. 
/// /// # Examples /// /// ``` /// use onig::{Regex, EncodedBytes, SearchOptions}; /// /// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap(); /// let res = r.match_with_encoding(EncodedBytes::ascii(b"world"), /// 0, SearchOptions::SEARCH_OPTION_NONE, None); /// assert!(res.is_some()); // it matches /// assert!(res.unwrap() == 5); // 5 characters matched /// ``` pub fn match_with_encoding( &self, chars: T, at: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option where T: EncodedChars, { let match_param = MatchParam::default(); let result = self.match_with_param(chars, at, options, region, match_param); match result { Ok(r) => r, Err(e) => panic!("Onig: Regex match error: {}", e.description()) } } /// Match string with encoding and match param /// /// Match the regex against a string. This method will start at /// the offset `at` into the string and try and match the /// regex. If the regex matches then the return value is the /// number of characters which matched. If the regex doesn't match /// the return is `None`. /// /// For more information see [Match vs /// Search](index.html#match-vs-search) /// /// The contents of `chars` must have the same encoding that was /// used to construct the regex. /// /// # Arguments /// /// * `chars` - The buffer to match against. /// * `at` - The byte index in the passed buffer to start matching /// * `options` - The regex match options. /// * `region` - The region for return group match range info /// * `match_param` - The match parameters /// /// # Returns /// /// `Ok(Some(len))` if the regex matched, with `len` being the number /// of bytes matched. `Ok(None)` if the regex doesn't match. `Err` with an /// `Error` if an error occurred (e.g. retry-limit-in-match exceeded). 
///
/// # Panics
///
/// Panics if the encoding of `chars` differs from the encoding of the
/// regex, or if `at` lies beyond the end of the buffer.
///
/// # Examples
///
/// ```
/// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions};
///
/// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap();
/// let res = r.match_with_param(EncodedBytes::ascii(b"world"),
///                              0, SearchOptions::SEARCH_OPTION_NONE,
///                              None, MatchParam::default());
/// assert!(res.is_ok()); // matching did not error
/// assert!(res.unwrap() == Some(5)); // 5 bytes matched
/// ```
pub fn match_with_param<T>(
    &self,
    chars: T,
    at: usize,
    options: SearchOptions,
    region: Option<&mut Region>,
    match_param: MatchParam,
) -> Result<Option<usize>, Error>
where
    T: EncodedChars,
{
    assert_eq!(chars.encoding(), self.encoding());

    let (beg, end) = (chars.start_ptr(), chars.limit_ptr());

    // Validate `at` BEFORE creating a pointer the engine will read through.
    // `wrapping_add` is safe for any offset, whereas `add` is undefined
    // behaviour as soon as the result leaves the buffer, which is exactly
    // the case this assertion exists to reject.
    let offset = beg.wrapping_add(at);
    assert!(beg <= offset && offset <= end);

    let region_ptr = match region {
        Some(region) => region as *mut Region as *mut onig_sys::OnigRegion,
        None => std::ptr::null_mut(),
    };

    // SAFETY: `offset` was checked to lie within `beg..=end`. `beg`/`end`
    // are assumed to delimit a valid buffer, as promised by the
    // `EncodedChars` implementation.
    let r = unsafe {
        onig_sys::onig_match_with_param(
            self.raw,
            beg,
            end,
            offset,
            region_ptr,
            options.bits(),
            match_param.as_raw(),
        )
    };

    if r >= 0 {
        Ok(Some(r as usize))
    } else if r == onig_sys::ONIG_MISMATCH {
        Ok(None)
    } else {
        Err(Error::from_code(r))
    }
}

/// Search pattern in string
///
/// Search for matches of the regex in a string. This method will return the
/// index of the first match of the regex within the string, if
/// there is one. If `from` is less than `to`, then search is performed
/// in forward order, otherwise – in backward order.
///
/// For more information see [Match vs
/// Search](index.html#match-vs-search)
///
/// # Arguments
///
/// * `str` - The string to search in.
/// * `from` - The byte index in the passed slice to start search
/// * `to` - The byte index in the passed slice to finish search
/// * `options` - The options for the search.
/// * `region` - The region for return group match range info
///
/// # Returns
///
/// `Some(pos)` if the regex matches, where `pos` is the
/// byte-position of the start of the match. `None` if the regex
/// doesn't match anywhere in `str`.
///
/// # Panics
///
/// Panics if the underlying search call reports an error rather than
/// a match or a mismatch.
///
/// # Examples
///
/// ```
/// use onig::{Regex, SearchOptions};
///
/// let r = Regex::new("l{1,2}").unwrap();
/// let res = r.search_with_options("hello", 0, 5, SearchOptions::SEARCH_OPTION_NONE, None);
/// assert!(res.is_some()); // it matches
/// assert!(res.unwrap() == 2); // match starts at byte offset 2 (the third character)
/// ```
pub fn search_with_options(
    &self,
    str: &str,
    from: usize,
    to: usize,
    options: SearchOptions,
    region: Option<&mut Region>,
) -> Option<usize> {
    // A `&str` is handed on as a UTF-8 encoded buffer.
    self.search_with_encoding(str, from, to, options, region)
}

/// Search for a Pattern in a String with an Encoding
///
/// Search for matches of the regex in a string. This method will
/// return the index of the first match of the regex within the
/// string, if there is one. If `from` is less than `to`, then
/// search is performed in forward order, otherwise – in backward
/// order.
///
/// For more information see [Match vs
/// Search](index.html#match-vs-search)
///
/// The encoding of the buffer passed to search in must match the
/// encoding of the regex.
///
/// # Arguments
///
/// * `chars` - The character buffer to search in.
/// * `from` - The byte index in the passed slice to start search
/// * `to` - The byte index in the passed slice to finish search
/// * `options` - The options for the search.
/// * `region` - The region for return group match range info
///
/// # Returns
///
/// `Some(pos)` if the regex matches, where `pos` is the
/// byte-position of the start of the match. `None` if the regex
/// doesn't match anywhere in `chars`.
/// /// # Examples /// /// ``` /// use onig::{Regex, EncodedBytes, SearchOptions}; /// /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap(); /// let res = r.search_with_encoding(EncodedBytes::ascii(b"hello"), /// 0, 5, SearchOptions::SEARCH_OPTION_NONE, None); /// assert!(res.is_some()); // it matches /// assert!(res.unwrap() == 2); // match starts at character 3 /// ``` pub fn search_with_encoding( &self, chars: T, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option where T: EncodedChars, { let match_param = MatchParam::default(); let result = self.search_with_param(chars, from, to, options, region, match_param); match result { Ok(r) => r, Err(e) => panic!("Onig: Regex search error: {}", e.description) } } /// Search pattern in string with encoding and match param /// /// Search for matches the regex in a string. This method will /// return the index of the first match of the regex within the /// string, if there is one. If `from` is less than `to`, then /// search is performed in forward order, otherwise – in backward /// order. /// /// For more information see [Match vs /// Search](index.html#match-vs-search) /// /// The encoding of the buffer passed to search in must match the /// encoding of the regex. /// /// # Arguments /// /// * `chars` - The character buffer to search in. /// * `from` - The byte index in the passed slice to start search /// * `to` - The byte index in the passed slice to finish search /// * `options` - The options for the search. /// * `region` - The region for return group match range info /// * `match_param` - The match parameters /// /// # Returns /// /// `Ok(Some(pos))` if the regex matches, where `pos` is the /// byte-position of the start of the match. `Ok(None)` if the regex /// doesn't match anywhere in `chars`. `Err` with an `Error` if an error /// occurred (e.g. retry-limit-in-match exceeded). 
///
    /// # Panics
    ///
    /// Panics if the encoding of `chars` differs from the encoding of the
    /// regex, or if `from` or `to` lies beyond the end of the buffer.
    ///
    /// # Examples
    ///
    /// ```
    /// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions};
    ///
    /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap();
    /// let res = r.search_with_param(EncodedBytes::ascii(b"hello"),
    ///                               0, 5, SearchOptions::SEARCH_OPTION_NONE,
    ///                               None, MatchParam::default());
    /// assert!(res.is_ok()); // matching did not error
    /// assert!(res.unwrap() == Some(2)); // match starts at byte offset 2 (the third character)
    /// ```
    pub fn search_with_param<T>(
        &self,
        chars: T,
        from: usize,
        to: usize,
        options: SearchOptions,
        region: Option<&mut Region>,
        match_param: MatchParam,
    ) -> Result<Option<usize>, Error>
    where
        T: EncodedChars,
    {
        let (beg, end) = (chars.start_ptr(), chars.limit_ptr());
        assert_eq!(self.encoding(), chars.encoding());

        // Validate the offsets BEFORE any pointer arithmetic: `pointer::add`
        // is undefined behaviour when the result leaves the allocation, so
        // checking `beg.add(from) <= end` afterwards would be too late.
        let buf_len = (end as usize).saturating_sub(beg as usize);
        assert!(from <= buf_len, "Onig: search start offset is out of bounds");
        assert!(to <= buf_len, "Onig: search range offset is out of bounds");

        // Hand Oniguruma the wrapped raw region (or null when the caller does
        // not want capture positions).
        let region_ptr = match region {
            Some(region) => &mut region.raw as *mut onig_sys::OnigRegion,
            None => std::ptr::null_mut(),
        };

        // SAFETY: `from` and `to` were checked against the buffer length
        // above, so both derived pointers stay within `beg..=end`.
        let r = unsafe {
            onig_sys::onig_search_with_param(
                self.raw,
                beg,
                end,
                beg.add(from),
                beg.add(to),
                region_ptr,
                options.bits(),
                match_param.as_raw(),
            )
        };

        if r >= 0 {
            // Non-negative results are the byte position of the match.
            Ok(Some(r as usize))
        } else if r == onig_sys::ONIG_MISMATCH {
            Ok(None)
        } else {
            Err(Error::from_code(r))
        }
    }

    /// Returns true if and only if the regex matches the string given.
    ///
    /// For more information see [Match vs
    /// Search](index.html#match-vs-search)
    ///
    /// # Arguments
    /// * `text` - The string slice to test against the pattern.
    ///
    /// # Returns
    ///
    /// `true` if the pattern matches the whole of `text`, `false` otherwise.
    pub fn is_match(&self, text: &str) -> bool {
        // A match anchored at offset 0 must consume every byte of `text`.
        self.match_with_options(text, 0, SearchOptions::SEARCH_OPTION_NONE, None)
            == Some(text.len())
    }

    /// Find a Match in a String
    ///
    /// Finds the first match of the regular expression within the
    /// buffer.
    ///
    /// Note that this should only be used if you want to discover the
    /// position of the match within a string.
Testing if a pattern /// matches the whole string is faster if you use `is_match`. For /// more information see [Match vs /// Search](index.html#match-vs-search) /// /// # Arguments /// * `text` - The text to search in. /// /// # Returns /// /// The offset of the start and end of the first match. If no /// match exists `None` is returned. pub fn find(&self, text: &str) -> Option<(usize, usize)> { self.find_with_encoding(text) } /// Find a Match in a Buffer, With Encoding /// /// Finds the first match of the regular expression within the /// buffer. /// /// For more information see [Match vs /// Search](index.html#match-vs-search) /// /// # Arguments /// * `text` - The text to search in. /// /// # Returns /// /// The offset of the start and end of the first match. If no /// match exists `None` is returned. pub fn find_with_encoding(&self, text: T) -> Option<(usize, usize)> where T: EncodedChars, { let mut region = Region::new(); let len = text.len(); self.search_with_encoding( text, 0, len, SearchOptions::SEARCH_OPTION_NONE, Some(&mut region), ).and_then(|_| region.pos(0)) } /// Get the Encoding of the Regex /// /// # Returns /// /// Returns a reference to an oniguruma encoding which was used /// when this regex was created. 
pub fn encoding(&self) -> onig_sys::OnigEncoding { unsafe { onig_sys::onig_get_encoding(self.raw) } } /// Get the Number of Capture Groups in this Pattern pub fn captures_len(&self) -> usize { unsafe { onig_sys::onig_number_of_captures(self.raw) as usize } } /// Get the Size of the Capture Histories for this Pattern pub fn capture_histories_len(&self) -> usize { unsafe { onig_sys::onig_number_of_capture_histories(self.raw) as usize } } } impl Drop for Regex { fn drop(&mut self) { unsafe { onig_sys::onig_free(self.raw); } } } #[cfg(test)] mod tests { use super::*; use std::panic; #[test] fn test_regex_create() { Regex::with_options(".*", RegexOptions::REGEX_OPTION_NONE, Syntax::default()).unwrap(); Regex::new(r#"a \w+ word"#).unwrap(); } #[test] fn test_regex_invalid() { let e = Regex::new("\\p{foo}").unwrap_err(); assert_eq!(e.code(), -223); assert_eq!(e.description(), "invalid character property name {foo}"); } #[test] fn test_failed_match() { let regex = Regex::new("foo").unwrap(); let res = regex.match_with_options("bar", 0, SearchOptions::SEARCH_OPTION_NONE, None); assert!(res.is_none()); } #[test] fn test_regex_search_with_options() { let mut region = Region::new(); let regex = Regex::new("e(l+)").unwrap(); let r = regex.search_with_options( "hello", 0, 5, SearchOptions::SEARCH_OPTION_NONE, Some(&mut region), ); assert!(region.tree().is_none()); assert_eq!(r, Some(1)); assert_eq!(region.len(), 2); let pos1 = region.pos(0).unwrap(); let pos2 = region.pos(1).unwrap(); assert_eq!(pos1, (1, 4)); assert_eq!(pos2, (2, 4)); // test cloning here since we already have a filled region let cloned_region = region.clone(); let pos1_clone = cloned_region.pos(0).unwrap(); assert_eq!(pos1_clone, pos1); } #[test] fn test_regex_match_with_options() { let mut region = Region::new(); let regex = Regex::new("he(l+)").unwrap(); let r = regex.match_with_options( "hello", 0, SearchOptions::SEARCH_OPTION_NONE, Some(&mut region), ); assert!(region.tree().is_none()); assert_eq!(r, 
Some(4));
        assert_eq!(region.len(), 2);
        let pos1 = region.pos(0).unwrap();
        let pos2 = region.pos(1).unwrap();
        assert_eq!(pos1, (0, 4));
        assert_eq!(pos2, (2, 4));
    }

    #[test]
    fn test_regex_is_match() {
        let regex = Regex::new("he(l+)o").unwrap();
        assert!(regex.is_match("hello"));
        assert!(!regex.is_match("hello 2.0"));
    }

    #[test]
    fn test_regex_find() {
        let regex = Regex::new("he(l+)o").unwrap();
        assert_eq!(regex.find("hey, hello!"), Some((5, 10)));
        assert_eq!(regex.find("hey, honey!"), None);
    }

    #[test]
    fn test_regex_captures_len() {
        let regex = Regex::new("(he)(l+)(o)").unwrap();
        assert_eq!(regex.captures_len(), 3);
    }

    // The input below makes this pattern exceed Oniguruma's retry limit; the
    // `*_with_param` APIs must surface that as an `Err`.
    #[test]
    fn test_regex_error_is_match() {
        let regex = Regex::new("(a|b|ab)*bc").unwrap();
        let result = regex.match_with_param(
            "ababababababababababababababababababababababababababababacbc",
            0,
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );

        let e = result.err().unwrap();
        assert_eq!("retry-limit-in-match over", e.description());
    }

    // The convenience APIs turn the same error into a panic with a `String`
    // payload, which is what `downcast_ref::<String>()` inspects.
    #[test]
    fn test_regex_panic_is_match() {
        let regex = Regex::new("(a|b|ab)*bc").unwrap();
        let result = panic::catch_unwind(|| {
            regex.is_match("ababababababababababababababababababababababababababababacbc")
        });
        let e = result.err().unwrap();
        let message = e.downcast_ref::<String>().unwrap();
        assert_eq!(
            message.as_str(),
            "Onig: Regex match error: retry-limit-in-match over"
        );
    }

    #[test]
    fn test_regex_error_find() {
        let regex = Regex::new("(a|b|ab)*bc").unwrap();
        let s = "ababababababababababababababababababababababababababababacbc";
        let result = regex.search_with_param(
            s,
            0,
            s.len(),
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );

        let e = result.err().unwrap();
        assert_eq!("retry-limit-in-match over", e.description());
    }

    #[test]
    fn test_regex_panic_find() {
        let regex = Regex::new("(a|b|ab)*bc").unwrap();
        let result = panic::catch_unwind(|| {
            regex.find("ababababababababababababababababababababababababababababacbc")
        });
        let e = result.err().unwrap();
        let message = e.downcast_ref::<String>().unwrap();
assert_eq!(message.as_str(), "Onig: Regex search error: retry-limit-in-match over");
    }
}
onig-4.3.2/src/match_param.rs010064400007650000024000000034501341641542700143150ustar0000000000000000
//! Match Parameters
//!
//! Contains the definition for the `MatchParam` struct. This can be
//! used to control the behavior of searching and matching.

use onig_sys;
use std::os::raw::c_uint;

/// Parameters for a Match or Search.
///
/// Owns an `OnigMatchParam` allocated by Oniguruma and frees it on drop.
pub struct MatchParam {
    raw: *mut onig_sys::OnigMatchParam,
}

impl MatchParam {
    /// Set the match stack limit
    pub fn set_match_stack_limit(&mut self, limit: u32) {
        unsafe {
            onig_sys::onig_set_match_stack_limit_size_of_match_param(self.raw, limit as c_uint);
        }
    }

    /// Set the retry limit in match
    pub fn set_retry_limit_in_match(&mut self, limit: u32) {
        unsafe {
            onig_sys::onig_set_retry_limit_in_match_of_match_param(self.raw, limit as c_uint);
        }
    }

    /// Get the Raw `OnigMatchParam` Pointer
    pub fn as_raw(&self) -> *const onig_sys::OnigMatchParam {
        self.raw
    }
}

impl Default for MatchParam {
    /// Allocates a match param and initialises it to Oniguruma's defaults.
    ///
    /// # Panics
    ///
    /// Panics if Oniguruma fails to allocate the parameter block.
    fn default() -> Self {
        let raw = unsafe { onig_sys::onig_new_match_param() };
        // Do not hand a null pointer to the initialiser (or later to the
        // setters and `Drop`); fail loudly on allocation failure instead.
        assert!(
            !raw.is_null(),
            "Onig: fail to memory allocation during match param creation"
        );
        unsafe {
            onig_sys::onig_initialize_match_param(raw);
        }
        MatchParam { raw }
    }
}

impl Drop for MatchParam {
    fn drop(&mut self) {
        unsafe {
            onig_sys::onig_free_match_param(self.raw);
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    pub fn create_default_match_param() {
        let _mp = MatchParam::default();
    }

    #[test]
    pub fn set_max_stack_size_limit() {
        let mut mp = MatchParam::default();
        mp.set_match_stack_limit(1000);
    }

    #[test]
    pub fn set_retry_limit_in_match() {
        let mut mp = MatchParam::default();
        mp.set_retry_limit_in_match(1000);
    }
}
onig-4.3.2/src/names.rs010064400007650000024000000120311341644637000131400ustar0000000000000000
use std::marker::PhantomData;
use std::iter::Iterator;
use std::ptr::null;
use std::str::from_utf8_unchecked;
use std::slice::from_raw_parts;
use std::os::raw::{c_int, c_uchar, c_uint, c_void};

use onig_sys::{self, OnigRegex, OnigUChar};

use super::Regex;

impl Regex {
/// Returns the number of named groups into regex. pub fn capture_names_len(&self) -> usize { unsafe { onig_sys::onig_number_of_names(self.raw) as usize } } /// Returns the iterator over named groups as a tuple with the group name /// and group indexes. pub fn capture_names(&self) -> CaptureNames<'_> { CaptureNames { table: unsafe { (*self.raw).name_table as *const StTable }, bin_idx: -1, entry_ptr: null(), _phantom: PhantomData, } } /// Calls `callback` for each named group in the regex. Each callback gets the group name /// and group indices. pub fn foreach_name(&self, mut callback: F) -> i32 where F: FnMut(&str, &[u32]) -> bool, { extern "C" fn foreach_cb( name: *const OnigUChar, name_end: *const OnigUChar, ngroup_num: c_int, group_nums: *const c_int, _regex: OnigRegex, arg: *mut c_void, ) -> c_int where F: FnMut(&str, &[u32]) -> bool, { let name = unsafe { from_utf8_unchecked(from_raw_parts(name, name_end as usize - name as usize)) }; let groups = unsafe { from_raw_parts(group_nums as *const u32, ngroup_num as usize) }; let callback = unsafe { &mut *(arg as *mut F) }; if callback(name, groups) { 0 } else { -1 } } unsafe { onig_sys::onig_foreach_name( self.raw, foreach_cb::, &mut callback as *mut F as *mut c_void, ) } } } #[repr(C)] #[derive(Debug)] struct NameEntry { name: *const c_uchar, name_len: c_int, back_num: c_int, back_alloc: c_int, back_ref1: c_int, back_refs: *const c_int, } #[cfg(windows)] type StDataT = ::libc::uintptr_t; #[cfg(not(windows))] type StDataT = ::std::os::raw::c_ulong; #[repr(C)] #[derive(Debug)] struct StTableEntry { hash: c_uint, key: StDataT, record: StDataT, next: *const StTableEntry, } #[repr(C)] #[derive(Debug)] struct StTable { type_: *const c_void, num_bins: c_int, num_entries: c_int, bins: *const *const StTableEntry, } /// CaptureNames is an iterator over named groups as a tuple with the group name /// and group indexes. /// /// `'r` is the lifetime of the Regex object. 
#[derive(Debug)] pub struct CaptureNames<'r> { table: *const StTable, bin_idx: c_int, entry_ptr: *const StTableEntry, _phantom: PhantomData<&'r Regex>, } impl<'r> Iterator for CaptureNames<'r> { type Item = (&'r str, &'r [u32]); fn next(&mut self) -> Option<(&'r str, &'r [u32])> { unsafe { while self.entry_ptr.is_null() { if self.table.is_null() || self.bin_idx + 1 >= (*self.table).num_bins { return None; } self.bin_idx += 1; self.entry_ptr = *(*self.table).bins.offset(self.bin_idx as isize) } let entry = (*self.entry_ptr).record as *const NameEntry; let name = from_utf8_unchecked(from_raw_parts((*entry).name, (*entry).name_len as usize)); let groups = if (*entry).back_num > 1 { let ptr = (*entry).back_refs as *const u32; let len = (*entry).back_num as usize; from_raw_parts(ptr, len) } else { let ptr = &(*entry).back_ref1 as *const i32 as *const u32; from_raw_parts(ptr, 1) }; self.entry_ptr = (*self.entry_ptr).next; Some((name, groups)) } } fn size_hint(&self) -> (usize, Option) { if self.table.is_null() { (0, None) } else { let size = unsafe { (*self.table).num_bins } as usize; (size, Some(size)) } } } #[cfg(test)] mod tests { use super::super::*; #[test] fn test_regex_names_len() { let regex = Regex::new("(he)(l+)(o)").unwrap(); assert_eq!(regex.capture_names_len(), 0); let regex = Regex::new("(?he)(?l+)(?o)").unwrap(); assert_eq!(regex.capture_names_len(), 2); assert_eq!(regex.capture_histories_len(), 0); } #[test] fn test_regex_names() { let regex = Regex::new("(he)(l+)(o)").unwrap(); let names = regex.capture_names().collect::>(); assert_eq!(names, vec![]); let regex = Regex::new("(?he)(?l+)(?o)").unwrap(); let names = regex.capture_names().collect::>(); assert_eq!( names, [("foo", &[1u32] as &[u32]), ("bar", &[2u32, 3] as &[u32])] ); } } onig-4.3.2/src/pattern.rs010064400007650000024000000135471341641542700135260ustar0000000000000000use std::str::pattern::{Pattern, SearchStep, Searcher}; use super::{FindMatches, Regex}; /// Regex Searcher Type /// /// 
Represents the state of an ongoing search over a given string /// slice. pub struct RegexSearcher<'r, 'a> { iter: FindMatches<'r, 'a>, pos: usize, hay: &'a str, cached_match: Option<(usize, usize)>, } impl<'r, 'a> Pattern<'a> for &'r Regex { /// Searcher Type /// /// The searcher is the type responsible for returning an iterator /// of matches in a given string type Searcher = RegexSearcher<'r, 'a>; /// Into Searcher /// /// Creates a new searcher instance from this `Regex` pattern fn into_searcher(self, haystack: &'a str) -> Self::Searcher { RegexSearcher::new(self, haystack) } } impl<'r, 'a> RegexSearcher<'r, 'a> { /// New /// /// Create a regex searcher which uses the given regex to search a /// given pattern. pub fn new(reg: &'r Regex, haystack: &'a str) -> Self { RegexSearcher::<'r, 'a> { iter: reg.find_iter(haystack), pos: 0, hay: haystack, cached_match: None, } } } unsafe impl<'r, 'a> Searcher<'a> for RegexSearcher<'r, 'a> { /// Haystack Accessor /// /// Return the contained reference to the haystack being searched. fn haystack(&self) -> &'a str { self.hay } /// Next /// /// Returns the indexes of the next `Match` or `Reject` of the /// pattern within the haystack. fn next(&mut self) -> SearchStep { // if we have a cached match then return it straight away if let Some((start, end)) = self.cached_match { self.cached_match = None; self.pos = end; return SearchStep::Match(start, end); } // If we have no more haystack to search, we are done if self.pos >= self.hay.len() { return SearchStep::Done; } // Search based on the current position let next = self.iter.next(); match next { // we found a new match at the beginning of our slice, so // just return it straight away Some((start, end)) if start == self.pos => { self.pos = end; SearchStep::Match(start, end) } // We found a match later on in the slice. 
So cache it for // now and return a rejection up to the start of the // match Some((start, _)) => { self.cached_match = next; SearchStep::Reject(self.pos, start) } // We didn't find anything in the remainder of the // slice. So issue a rejection for the remaining buffer None => { let old_pos = self.pos; self.pos = self.hay.len(); SearchStep::Reject(old_pos, self.pos) } } } } #[cfg(test)] mod test { use Regex; use std::str::pattern::{Pattern, SearchStep, Searcher}; #[test] pub fn pattern_matches_in_str_returns_all_matches() { { let pattern = Regex::new("abc").unwrap(); let v: Vec<&str> = "abcXXXabcYYYabc".matches(&pattern).collect(); assert_eq!(v, ["abc", "abc", "abc"]); } { let pattern = Regex::new("a+").unwrap(); let v: Vec<&str> = ".a..aaa.a".matches(&pattern).collect(); assert_eq!(v, ["a", "aaa", "a"]); } } #[test] pub fn pattern_matches_with_index_returns_all_matches() { let pattern = Regex::new("[0-9]+").unwrap(); let v: Vec<(usize, &str)> = "hello 1234 12.34 3".match_indices(&pattern).collect(); assert_eq!(v, [(6, "1234"), (11, "12"), (14, "34"), (17, "3")]); } #[test] pub fn pattern_trim_matches_removes_matches() { { let pattern = Regex::new("a+").unwrap(); let trimmed = "aaaaworld".trim_left_matches(&pattern); assert_eq!(trimmed, "world"); } { let pattern = Regex::new("[ab]").unwrap(); let trimmed = "aabbbababtbaest".trim_left_matches(&pattern); assert_eq!(trimmed, "tbaest"); } { let pattern = Regex::new(r#"[ \t]"#).unwrap(); let trimmed = " \t".trim_left_matches(&pattern); assert_eq!(trimmed, ""); } } #[test] pub fn pattern_as_searcher_returns_expected_rejections() { { let reg = Regex::new("[ab]").unwrap(); let mut searcher = reg.into_searcher("a.b"); assert_eq!(searcher.next(), SearchStep::Match(0, 1)); assert_eq!(searcher.next(), SearchStep::Reject(1, 2)); assert_eq!(searcher.next(), SearchStep::Match(2, 3)); assert_eq!(searcher.next(), SearchStep::Done); } { let reg = Regex::new("test").unwrap(); let mut searcher = reg.into_searcher("this test string"); 
assert_eq!(searcher.next(), SearchStep::Reject(0, 5));
            assert_eq!(searcher.next(), SearchStep::Match(5, 9));
            assert_eq!(searcher.next(), SearchStep::Reject(9, 16));
            assert_eq!(searcher.next(), SearchStep::Done);
        }
    }

    #[test]
    pub fn pattern_match_with_empty_matches() {
        let reg = Regex::new(r"\b").unwrap();
        let matches: Vec<(usize, &str)> = "hello world".match_indices(&reg).collect();
        assert_eq!(matches, [(0, ""), (5, ""), (6, ""), (11, "")]);
    }

    #[test]
    pub fn pattern_split_with_empty_matches() {
        let reg = Regex::new(r"e?").unwrap();
        let split: Vec<&str> = "test".split(&reg).collect();
        assert_eq!(split, ["", "t", "s", "t", ""]);
    }

    #[test]
    pub fn pattern_match_prefix_returns_true_when_regex_is_prefix() {
        let pattern = Regex::new("a+").unwrap();
        assert!(pattern.is_prefix_of("aaaaaworld"));
    }
}
onig-4.3.2/src/region.rs010064400007650000024000000226371341641542700133340ustar0000000000000000
#![allow(clippy::transmute_ptr_to_ref)]

use std::ptr::null;
use std::mem::transmute;
use std::os::raw::c_int;
use onig_sys;

use super::CaptureTreeNode;
use super::flags::TraverseCallbackAt;

/// Represents a set of capture groups found in a search or match.
///
/// The wrapper is `repr(transparent)` so that a pointer to a `Region` may be
/// reinterpreted as a pointer to the raw `OnigRegion` it contains.
#[repr(transparent)]
#[derive(Debug, Eq, PartialEq)]
pub struct Region {
    pub(crate) raw: onig_sys::OnigRegion,
}

impl Region {
    /// Create a new empty `Region`
    pub fn new() -> Region {
        Region {
            raw: onig_sys::OnigRegion {
                allocated: 0,
                num_regs: 0,
                beg: null(),
                end: null(),
                history_root: null(),
            },
        }
    }

    /// Create a new region with a given capacity. This function allocates
    /// a new region object as in `Region::new` and resizes it to
    /// contain at least `capacity` regions.
    ///
    /// # Arguments
    ///
    /// * `capacity` - the number of captures this region should be
    /// capable of storing without allocation.
    pub fn with_capacity(capacity: usize) -> Region {
        let mut region = Self::new();
        region.reserve(capacity);
        region
    }

    /// Clone From Raw
    ///
    /// Construct a new region based on an existing raw
    /// `*onig_sys::OnigRegion` pointer by copying.
#[allow(clippy::not_unsafe_ptr_arg_deref)]
    pub fn clone_from_raw(ptr: *const onig_sys::OnigRegion) -> Self {
        let mut region = Self::new();
        unsafe {
            onig_sys::onig_region_copy(&mut region.raw, ptr);
        }
        region
    }

    /// This can be used to clear out a region so it can be used
    /// again. See [`onig_sys::onig_region_clear`][region_clear]
    ///
    /// [region_clear]: ./onig_sys/fn.onig_region_clear.html
    pub fn clear(&mut self) {
        unsafe {
            onig_sys::onig_region_clear(&mut self.raw);
        }
    }

    /// Get the current capacity of the region.
    pub fn capacity(&self) -> usize {
        self.raw.allocated as usize
    }

    /// Updates the region to contain `new_capacity` slots. See
    /// [`onig_sys::onig_region_resize`][region_resize] for more
    /// information.
    ///
    /// [region_resize]: ./onig_sys/fn.onig_region_resize.html
    ///
    /// # Arguments
    ///
    /// * `new_capacity` - The new number of groups in the region.
    ///
    /// # Panics
    ///
    /// Panics if `new_capacity` does not fit in a C `int`, or if the
    /// resize fails.
    pub fn reserve(&mut self, new_capacity: usize) {
        // An `as` cast would silently wrap an oversized value into a
        // negative or unrelated count before it reaches the C API.
        assert!(
            new_capacity <= c_int::max_value() as usize,
            "Onig: region capacity does not fit in a C int"
        );
        let r = unsafe { onig_sys::onig_region_resize(&mut self.raw, new_capacity as c_int) };
        if r != onig_sys::ONIG_NORMAL {
            panic!("Onig: fail to memory allocation during region resize")
        }
    }

    /// Get the size of the region.
    ///
    /// Returns the number of registers in the region.
    pub fn len(&self) -> usize {
        self.raw.num_regs as usize
    }

    /// Check if the region is empty.
    ///
    /// Returns true if there are no registers in the region.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the start and end positions of the Nth capture group.
    ///
    /// Returns `None` if `pos` is not a valid capture group or if the
    /// capture group did not match anything. The positions returned
    /// are always byte indices with respect to the original string
    /// matched.
pub fn pos(&self, pos: usize) -> Option<(usize, usize)> {
        // Indices at or beyond the register count are not capture groups.
        if pos >= self.len() {
            return None;
        }
        let pos = pos as isize;
        let (beg, end) = unsafe {
            (
                *self.raw.beg.offset(pos),
                *self.raw.end.offset(pos),
            )
        };
        // `ONIG_REGION_NOTPOS` marks a group that did not take part in the match.
        if beg != onig_sys::ONIG_REGION_NOTPOS {
            Some((beg as usize, end as usize))
        } else {
            None
        }
    }

    /// Get Capture Tree
    ///
    /// Returns the capture tree for this region, if there is one.
    pub fn tree(&self) -> Option<&CaptureTreeNode> {
        let tree = unsafe { onig_sys::onig_get_capture_tree(&self.raw) };
        if tree.is_null() {
            None
        } else {
            Some(unsafe { transmute(tree) })
        }
    }

    /// Get an iterator over the captures in the region.
    pub fn iter(&self) -> RegionIter<'_> {
        RegionIter {
            region: self,
            pos: 0,
        }
    }

    /// Walk the Tree of Captures
    ///
    /// The given callback is invoked for each node in the capture
    /// tree. Each node is passed to the callback before any children.
    pub fn tree_traverse<F>(&self, callback: F) -> i32
    where
        F: Fn(u32, (usize, usize), u32) -> bool,
    {
        self.tree_traverse_at(TraverseCallbackAt::CALLBACK_AT_FIRST, callback)
    }

    /// Walk the Tree of Captures in a Given Order
    ///
    /// The given callback is invoked for each node in the capture
    /// tree. The order in which the callback is invoked can be
    /// chosen.
pub fn tree_traverse_at(&self, at: TraverseCallbackAt, mut callback: F) -> i32 where F: Fn(u32, (usize, usize), u32) -> bool, { use onig_sys::onig_capture_tree_traverse; use std::os::raw::{c_int, c_void}; extern "C" fn traverse_cb( group: c_int, beg: c_int, end: c_int, level: c_int, _at: c_int, ud: *mut c_void, ) -> c_int where F: Fn(u32, (usize, usize), u32) -> bool, { let callback = unsafe { &*(ud as *mut F) }; if callback(group as u32, (beg as usize, end as usize), level as u32) { 0 } else { -1 } } unsafe { onig_capture_tree_traverse( &self.raw, at.bits(), // ONIG_TRAVERSE_CALLBACK_AT_FIRST, traverse_cb::, &mut callback as *mut F as *mut c_void, ) } } } impl Default for Region { fn default() -> Self { Region::new() } } impl Drop for Region { fn drop(&mut self) { unsafe { onig_sys::onig_region_free(&mut self.raw, 0); } } } impl Clone for Region { fn clone(&self) -> Self { Self::clone_from_raw(&self.raw) } } impl<'a> IntoIterator for &'a Region { type Item = (usize, usize); type IntoIter = RegionIter<'a>; fn into_iter(self) -> Self::IntoIter { self.iter() } } /// Region Iterator /// /// This struct is responsible for holding iteration state over a /// given region. 
pub struct RegionIter<'a> {
    region: &'a Region,
    pos: usize,
}

impl<'a> Iterator for RegionIter<'a> {
    type Item = (usize, usize);

    /// Yields the position of the next group, or `None` when the cursor is
    /// past the last register or the group at the cursor did not match.
    fn next(&mut self) -> Option<Self::Item> {
        let next = self.region.pos(self.pos);
        self.pos += 1;
        next
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        // At most the registers after the cursor can still be yielded. The
        // lower bound is zero because an unmatched group makes `next` return
        // `None` before the last register is reached.
        let remaining = self.region.len().saturating_sub(self.pos);
        (0, Some(remaining))
    }

    // `count` is intentionally not overridden: the default implementation
    // counts exactly what iteration yields, which also holds for a partly
    // consumed iterator and for regions with unmatched groups.
}

#[cfg(test)]
mod tests {
    use super::*;
    use super::super::{Regex, SearchOptions};

    #[test]
    fn test_region_create() {
        Region::new();
    }

    #[test]
    fn test_region_clear() {
        let mut region = Region::new();
        region.clear();
    }

    #[test]
    fn test_region_copy() {
        let region = Region::new();
        let new_region = region.clone();
        assert_eq!(new_region.len(), region.len());
    }

    #[test]
    fn test_region_resize() {
        {
            let mut region = Region::new();
            assert!(region.capacity() == 0);
            region.reserve(100);
            {
                // can still get the capacity without a mutable borrow
                let region_borrowed = &region;
                assert!(region_borrowed.capacity() == 100);
            }
        }

        {
            let region = Region::with_capacity(10);
            assert!(region.capacity() == 10);
        }
    }

    #[test]
    fn test_region_empty_iterate() {
        let region = Region::new();
        for _ in &region {
            panic!("region should not contain any elements");
        }
    }

    #[test]
    fn test_region_iter_returns_iterator() {
        let region = Region::new();
        let all = region.iter().collect::<Vec<_>>();
        assert_eq!(all, Vec::new());
    }

    #[test]
    fn test_region_iterate_with_captures() {
        let mut region = Region::new();
        let reg = Regex::new("(a+)(b+)(c+)").unwrap();
        let res = reg.search_with_options(
            "aaaabbbbc",
            0,
            9,
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut region),
        );
        assert!(res.is_some());
        let all = region.iter().collect::<Vec<_>>();
        assert_eq!(all, vec![(0, 9), (0, 4), (4, 8), (8, 9)]);
    }

    #[test]
    fn test_region_all_iteration_options() {
        let mut region = Region::new();
        let reg = Regex::new("a(b)").unwrap();
        let res = reg.search_with_options(
            "habitat",
            0,
            7,
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut region),
        );
        assert!(res.is_some());

        // collect into a vector by iterating with a for loop
        let
mut a = Vec::<(usize, usize)>::new(); for pos in ®ion { a.push(pos) } // collect into a vector by using `iter` and collec let b = region.iter().collect::>(); let expected = vec![(1, 3), (2, 3)]; assert_eq!(expected, a); assert_eq!(expected, b); assert_eq!(2, region.iter().count()); } } onig-4.3.2/src/replace.rs010064400007650000024000000074511341641542700134610ustar0000000000000000use std::borrow::Cow; use super::{Captures, Regex}; /// Replacer describes types that can be used to replace matches in a string. /// /// Implementations are provided for replacement using string literals /// and `FnMut` callbacks. If this isn't enough for your replacement /// needs a user-supplied `Replacer` implemenation can be /// provided. For an example of a custom replacer implementation check /// out `examples/dollar.rs` in the Onig crate. pub trait Replacer { /// Returns a possibly owned string that is used to replace the match /// corresponding to the `caps` capture group. fn reg_replace(&mut self, caps: &Captures) -> Cow; } /// Replacement using Literal Strings impl<'t> Replacer for &'t str { fn reg_replace(&mut self, _: &Captures) -> Cow { (*self).into() } } /// Replacement using `FnMut` Callbacks impl Replacer for F where F: FnMut(&Captures) -> String, { fn reg_replace<'a>(&'a mut self, caps: &Captures) -> Cow<'a, str> { (*self)(caps).into() } } impl Regex { /// Replaces the leftmost-first match with the replacement provided. /// The replacement can be a regular string or a function that takes /// the matches `Captures` and returns the replaced string. /// /// If no match is found, then a copy of the string is returned unchanged. /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement. 
/// In typical usage, this can just be a normal string:
    ///
    /// ```rust
    /// # extern crate onig; use onig::Regex;
    /// # fn main() {
    /// let re = Regex::new("[^01]+").unwrap();
    /// assert_eq!(re.replace("1078910", ""), "1010");
    /// # }
    /// ```
    ///
    /// But anything satisfying the `Replacer` trait will work. For example,
    /// a closure of type `|&Captures| -> String` provides direct access to the
    /// captures corresponding to a match. This allows one to access
    /// submatches easily:
    ///
    /// ```rust
    /// # extern crate onig; use onig::Regex;
    /// # use onig::Captures; fn main() {
    /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap();
    /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| {
    ///     format!("{} {}", caps.at(2).unwrap_or(""), caps.at(1).unwrap_or(""))
    /// });
    /// assert_eq!(result, "Bruce Springsteen");
    /// # }
    /// ```
    pub fn replace<R: Replacer>(&self, text: &str, rep: R) -> String {
        // A limit of 1 replaces only the first match.
        self.replacen(text, 1, rep)
    }

    /// Replaces all non-overlapping matches in `text` with the
    /// replacement provided. This is the same as calling `replacen` with
    /// `limit` set to `0`.
    ///
    /// See the documentation for `replace` for details on how to access
    /// submatches in the replacement string.
    pub fn replace_all<R: Replacer>(&self, text: &str, rep: R) -> String {
        // A limit of 0 means "no limit" for `replacen`.
        self.replacen(text, 0, rep)
    }

    /// Replaces at most `limit` non-overlapping matches in `text` with the
    /// replacement provided. If `limit` is 0, then all non-overlapping matches
    /// are replaced.
    ///
    /// See the documentation for `replace` for details on how to access
    /// submatches in the replacement string.
pub fn replacen(&self, text: &str, limit: usize, mut rep: R) -> String { let mut new = String::with_capacity(text.len()); let mut last_match = 0; for (i, cap) in self.captures_iter(text).enumerate() { if limit > 0 && i >= limit { break; } // unwrap on 0 is OK because captures only reports matches let (s, e) = cap.pos(0).unwrap(); new.push_str(&text[last_match..s]); new.push_str(&rep.reg_replace(&cap)); last_match = e; } new.push_str(&text[last_match..]); new } } onig-4.3.2/src/syntax.rs010064400007650000024000000135231341641542700133710ustar0000000000000000#![allow(clippy::transmute_ptr_to_ptr)] #![allow(clippy::transmute_ptr_to_ref)] use std::mem::transmute; use onig_sys; use super::{MetaCharType, RegexOptions, SyntaxBehavior, SyntaxOperator}; /// Meta Character State /// /// Defines if a given meta character is enabled or not within a given /// syntax. If the character is enabled it also contains the rust /// `char` that it is set to. #[derive(Copy, Clone)] pub enum MetaChar { /// The meta character is set to the chosen `char` Character(char), /// The meta character is not enabled Ineffective, } /// Onig Syntax Wrapper /// /// Each syntax dfines a flavour of regex syntax. This type allows /// interaction with the built-in syntaxes through the static accessor /// functions (`Syntax::emacs()`, `Syntax::default()` etc.) and the /// creation of custom syntaxes. /// /// For a demonstration of creating a custom syntax see /// `examples/syntax.rs` in the main onig crate. 
#[repr(transparent)]
#[derive(Debug, Clone, Copy)]
pub struct Syntax {
    // `repr(transparent)` guarantees the same layout as the raw type, which
    // the accessors below rely on when they reinterpret a reference to a
    // built-in `OnigSyntaxType` as a `&Syntax`.
    raw: onig_sys::OnigSyntaxType,
}

impl Syntax {
    /// Plain text syntax
    pub fn asis() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxASIS) }
    }

    /// POSIX Basic RE syntax
    pub fn posix_basic() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxPosixBasic) }
    }

    /// POSIX Extended RE syntax
    pub fn posix_extended() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxPosixExtended) }
    }

    /// Emacs syntax
    pub fn emacs() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxEmacs) }
    }

    /// Grep syntax
    pub fn grep() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxGrep) }
    }

    /// GNU regex syntax
    pub fn gnu_regex() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxGnuRegex) }
    }

    /// Java (Sun java.util.regex) syntax
    pub fn java() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxJava) }
    }

    /// Perl syntax
    pub fn perl() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxPerl) }
    }

    /// Perl + named group syntax
    pub fn perl_ng() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxPerl_NG) }
    }

    /// Ruby syntax
    pub fn ruby() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxRuby) }
    }

    /// Oniguruma Syntax
    pub fn oniguruma() -> &'static Syntax {
        unsafe { transmute(&onig_sys::OnigSyntaxOniguruma) }
    }

    /// Default syntax (Ruby syntax)
    pub fn default() -> &'static Syntax {
        // `OnigDefaultSyntax` is passed without `&`, unlike the statics above.
        unsafe { transmute(onig_sys::OnigDefaultSyntax) }
    }

    /// Retrieve the operators for this syntax
    pub fn operators(&self) -> SyntaxOperator {
        unsafe {
            let op = onig_sys::onig_get_syntax_op(&self.raw);
            let op2 = onig_sys::onig_get_syntax_op2(&self.raw);
            // `op` fills the low 32 bits and `op2` the high 32 bits.
            SyntaxOperator::from_bits_truncate(u64::from(op) + (u64::from(op2) << 32))
        }
    }

    /// Replace the operators for this syntax
    pub fn set_operators(&mut self, operators: SyntaxOperator) {
        // Split the combined 64-bit flag set back into its two halves.
        let op = operators.bits() as onig_sys::OnigSyntaxOp;
        let op2 = (operators.bits() >> 32) as onig_sys::OnigSyntaxOp2;
        unsafe {
onig_sys::onig_set_syntax_op(&mut self.raw, op);
            onig_sys::onig_set_syntax_op2(&mut self.raw, op2)
        }
    }

    /// Enable Operators for this Syntax
    ///
    /// Adds the chosen operators to the ones that are already enabled.
    pub fn enable_operators(&mut self, operators: SyntaxOperator) {
        let enabled = self.operators() | operators;
        self.set_operators(enabled)
    }

    /// Disable Operators for this Syntax
    ///
    /// Removes the specified operators from the ones that are enabled.
    pub fn disable_operators(&mut self, operators: SyntaxOperator) {
        let remaining = self.operators() & !operators;
        self.set_operators(remaining)
    }

    /// Retrieves the syntax behaviours
    pub fn behavior(&self) -> SyntaxBehavior {
        let bits = unsafe { onig_sys::onig_get_syntax_behavior(&self.raw) };
        SyntaxBehavior::from_bits_truncate(bits)
    }

    /// Overwrite the syntax behaviour for this syntax.
    pub fn set_behavior(&mut self, behavior: SyntaxBehavior) {
        let raw_behavior = behavior.bits() as onig_sys::OnigSyntaxBehavior;
        unsafe {
            onig_sys::onig_set_syntax_behavior(&mut self.raw, raw_behavior);
        }
    }

    /// Enable a given behaviour for this syntax
    pub fn enable_behavior(&mut self, behavior: SyntaxBehavior) {
        let enabled = self.behavior() | behavior;
        self.set_behavior(enabled)
    }

    /// Disable a given behaviour for this syntax
    pub fn disable_behavior(&mut self, behavior: SyntaxBehavior) {
        let remaining = self.behavior() & !behavior;
        self.set_behavior(remaining)
    }

    /// Retrieve the syntax options for this syntax
    pub fn options(&self) -> RegexOptions {
        let bits = unsafe { onig_sys::onig_get_syntax_options(&self.raw) };
        RegexOptions::from_bits_truncate(bits)
    }

    /// Replace the syntax options for this syntax
    pub fn set_options(&mut self, options: RegexOptions) {
        let raw_options = options.bits() as onig_sys::OnigOptionType;
        unsafe {
            onig_sys::onig_set_syntax_options(&mut self.raw, raw_options);
        }
    }

    /// Set a given meta character's state
    ///
    /// Arguments:
    ///  - `what`: The meta character to update
    ///  - `meta`: The value to set the meta character to
    pub fn
set_meta_char(&mut self, what: MetaCharType, meta: MetaChar) { let what = what.bits(); let code = match meta { MetaChar::Ineffective => onig_sys::ONIG_INEFFECTIVE_META_CHAR, MetaChar::Character(char) => char as u32, }; unsafe { onig_sys::onig_set_meta_char(&mut self.raw, what, code); } } } onig-4.3.2/src/tree.rs010064400007650000024000000060411341641542700127770ustar0000000000000000#![allow(clippy::transmute_ptr_to_ref)] use std::mem::transmute; use std::ops::Index; use std::iter::Iterator; use onig_sys; /// Capture Tree Node /// /// Represents a single node in the capture tree. Can be queried for /// information about the given capture and any child-captures that /// took place. #[repr(C)] #[derive(Debug)] pub struct CaptureTreeNode { raw: onig_sys::OnigCaptureTreeNode, } impl CaptureTreeNode { /// The capture group number for this capture pub fn group(&self) -> usize { self.raw.group as usize } /// The extent of this capture pub fn pos(&self) -> (usize, usize) { (self.raw.beg as usize, self.raw.end as usize) } /// The number of child captures this group contains pub fn len(&self) -> usize { self.raw.num_childs as usize } /// Does the node have any child captures? 
pub fn is_empty(&self) -> bool { self.len() == 0 } /// An iterator over thie children of this capture group pub fn children(&self) -> CaptureTreeNodeIter<'_> { CaptureTreeNodeIter { idx: 0, node: self } } } impl Index for CaptureTreeNode { type Output = CaptureTreeNode; fn index(&self, index: usize) -> &CaptureTreeNode { if index >= self.len() { panic!("capture tree node index overflow") } unsafe { transmute(*self.raw.childs.add(index)) } } } /// Caputres iterator #[derive(Debug)] pub struct CaptureTreeNodeIter<'t> { idx: usize, node: &'t CaptureTreeNode, } impl<'t> Iterator for CaptureTreeNodeIter<'t> { type Item = &'t CaptureTreeNode; fn next(&mut self) -> Option<&'t CaptureTreeNode> { if self.idx < self.node.len() { self.idx += 1; Some(&self.node[self.idx - 1]) } else { None } } fn size_hint(&self) -> (usize, Option) { let size = self.node.len(); (size, Some(size)) } } #[cfg(test)] mod tests { use super::super::*; #[test] fn test_regex_search_with_region_tree() { let mut region = Region::new(); let mut syntax = Syntax::ruby().clone(); syntax.enable_operators(SyntaxOperator::SYNTAX_OPERATOR_ATMARK_CAPTURE_HISTORY); let regex = Regex::with_options( "(?@a+(?@b+))|(?@c+(?@d+))", RegexOptions::REGEX_OPTION_NONE, &syntax, ).unwrap(); let r = regex.search_with_options( "- cd aaabbb -", 0, 13, SearchOptions::SEARCH_OPTION_NONE, Some(&mut region), ); assert_eq!(r, Some(2)); assert_eq!(region.len(), 5); let tree = region.tree().unwrap(); assert_eq!(tree.len(), 1); assert_eq!(tree.group(), 0); assert_eq!(tree.pos(), (2, 4)); assert_eq!(tree[0].len(), 1); assert_eq!(tree[0].group(), 3); assert_eq!(tree[0].pos(), (2, 4)); assert_eq!(tree[0][0].len(), 0); assert_eq!(tree[0][0].group(), 4); assert_eq!(tree[0][0].pos(), (3, 4)); } } onig-4.3.2/src/utils.rs010064400007650000024000000033501341641542700132000ustar0000000000000000use std::mem; use std::ffi::{CStr, CString}; use onig_sys; /// Get Version /// /// Returns the version information for the underlying Oniguruma /// API. 
This is separate from the Rust Onig and onig_sys versions. pub fn version() -> String { let raw_version = unsafe { CStr::from_ptr(onig_sys::onig_version()) }; raw_version.to_string_lossy().into_owned() } /// Get Copyright /// /// Returns copyright information for the underlying Oniguruma /// API. Rust onig is licensed seperately. For more information see /// LICENSE.md in the source distribution. pub fn copyright() -> String { let raw_copy = unsafe { CStr::from_ptr(onig_sys::onig_copyright()) }; raw_copy.to_string_lossy().into_owned() } pub type CodePointRange = (onig_sys::OnigCodePoint, onig_sys::OnigCodePoint); /// Create a User Defined Proeprty /// /// Creates a new user defined property from the given OnigCodePoint vlaues. pub fn define_user_property(name: &str, ranges: &[CodePointRange]) -> i32 { let mut raw_ranges = vec![ranges.len() as onig_sys::OnigCodePoint]; for &(start, end) in ranges { raw_ranges.push(start); raw_ranges.push(end); } let name = CString::new(name).unwrap(); let r = unsafe { onig_sys::onig_unicode_define_user_property(name.as_ptr(), raw_ranges.as_ptr()) }; // Deliberately leak the memory here as Onig expects to be able to // hang on to the pointer we just gave it. I'm not happy about it // but this does work and the amounts of memory leaked should be // trivial. mem::forget(raw_ranges); r } #[cfg(test)] mod tests { use super::*; #[test] pub fn utils_get_copyright_is_not_emtpy() { let copyright = copyright(); assert!(copyright.len() > 0); } }