grep-searcher-0.1.8/.cargo_vcs_info.json0000644000000001120000000000100135630ustar { "git": { "sha1": "dd47582619939c3f105a7161b2e313683d64aefe" } } grep-searcher-0.1.8/Cargo.lock0000644000000114270000000000100115510ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "aho-corasick" version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] [[package]] name = "bstr" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90682c8d613ad3373e66de8c6411e0ae2ab2571e879d2efbf73558cc66f21279" dependencies = [ "lazy_static", "memchr", "regex-automata", ] [[package]] name = "bytecount" version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72feb31ffc86498dacdbd0fcebb56138e7177a8cc5cea4516031d15ae85a742e" [[package]] name = "cfg-if" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "encoding_rs" version = "0.8.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" dependencies = [ "cfg-if 1.0.0", "packed_simd_2", ] [[package]] name = "encoding_rs_io" version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1cc3c5651fb62ab8aa3103998dade57efdd028544bd300516baa31840c252a83" dependencies = [ "encoding_rs", ] [[package]] name = "grep-matcher" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6d27563c33062cd33003b166ade2bb4fd82db1fd6a86db764dfdad132d46c1cc" dependencies = [ "memchr", ] [[package]] name = "grep-regex" version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "121553c9768c363839b92fc2d7cdbbad44a3b70e8d6e7b1b72b05c977527bd06" dependencies = [ "aho-corasick", "bstr", "grep-matcher", "log", "regex", "regex-syntax", "thread_local", ] [[package]] name = "grep-searcher" version = "0.1.8" dependencies = [ "bstr", "bytecount", "encoding_rs", "encoding_rs_io", "grep-matcher", "grep-regex", "log", "memmap2", "regex", ] [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" version = "0.2.97" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12b8adadd720df158f4d70dfe7ccc6adb0472d7c55ca83445f6a5ab3e36f8fb6" [[package]] name = "libm" version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fc7aa29613bd6a620df431842069224d8bc9011086b1db4c0e0cd47fa03ec9a" [[package]] name = "log" version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ "cfg-if 1.0.0", ] [[package]] name = "memchr" version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "memmap2" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20ff203f7bdc401350b1dbaa0355135777d25f41c0bbc601851bbd6cf61e8ff5" dependencies = [ "libc", ] [[package]] name = "once_cell" version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" [[package]] name = "packed_simd_2" version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e64858a2d3733fdd61adfdd6da89aa202f7ff0e741d2fc7ed1e452ba9dc99d7" dependencies = [ "cfg-if 0.1.10", "libm", ] [[package]] name = "regex" version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-syntax" version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "thread_local" version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8018d24e04c95ac8790716a5987d0fec4f8b27249ffa0f7d33f1369bdfb88cbd" dependencies = [ "once_cell", ] grep-searcher-0.1.8/Cargo.toml0000644000000030700000000000100115670ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] edition = "2018" name = "grep-searcher" version = "0.1.8" authors = ["Andrew Gallant "] description = "Fast line oriented regex searching as a library.\n" homepage = "https://github.com/BurntSushi/ripgrep/tree/master/crates/searcher" documentation = "https://docs.rs/grep-searcher" readme = "README.md" keywords = ["regex", "grep", "egrep", "search", "pattern"] license = "Unlicense/MIT" repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/searcher" [dependencies.bstr] version = "0.2.0" features = ["std"] default-features = false [dependencies.bytecount] version = "0.6" [dependencies.encoding_rs] version = "0.8.14" [dependencies.encoding_rs_io] version = "0.1.6" [dependencies.grep-matcher] version = "0.1.5" [dependencies.log] version = "0.4.5" [dependencies.memmap] version = "0.3.0" package = "memmap2" [dev-dependencies.grep-regex] version = "0.1.9" [dev-dependencies.regex] version = "1.1" [features] avx-accel = [] default = ["bytecount/runtime-dispatch-simd"] simd-accel = ["encoding_rs/simd-accel"] grep-searcher-0.1.8/Cargo.toml.orig000064400000000000000000000020510072674642500152760ustar 00000000000000[package] name = "grep-searcher" version = "0.1.8" #:version authors = ["Andrew Gallant "] description = """ Fast line oriented regex searching as a library. """ documentation = "https://docs.rs/grep-searcher" homepage = "https://github.com/BurntSushi/ripgrep/tree/master/crates/searcher" repository = "https://github.com/BurntSushi/ripgrep/tree/master/crates/searcher" readme = "README.md" keywords = ["regex", "grep", "egrep", "search", "pattern"] license = "Unlicense/MIT" edition = "2018" [dependencies] bstr = { version = "0.2.0", default-features = false, features = ["std"] } bytecount = "0.6" encoding_rs = "0.8.14" encoding_rs_io = "0.1.6" grep-matcher = { version = "0.1.5", path = "../matcher" } log = "0.4.5" memmap = { package = "memmap2", version = "0.3.0" } [dev-dependencies] grep-regex = { version = "0.1.9", path = "../regex" } regex = "1.1" [features] default = ["bytecount/runtime-dispatch-simd"] simd-accel = ["encoding_rs/simd-accel"] # This feature is DEPRECATED. Runtime dispatch is used for SIMD now. avx-accel = [] grep-searcher-0.1.8/LICENSE-MIT000064400000000000000000000020710072674642500140450ustar 00000000000000The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. grep-searcher-0.1.8/README.md000064400000000000000000000016120072674642500136700ustar 00000000000000grep-searcher ------------- A high level library for executing fast line oriented searches. This handles things like reporting contextual lines, counting lines, inverting a search, detecting binary data, automatic UTF-16 transcoding and deciding whether or not to use memory maps. [![Build status](https://github.com/BurntSushi/ripgrep/workflows/ci/badge.svg)](https://github.com/BurntSushi/ripgrep/actions) [![](https://img.shields.io/crates/v/grep-searcher.svg)](https://crates.io/crates/grep-searcher) Dual-licensed under MIT or the [UNLICENSE](https://unlicense.org/). ### Documentation [https://docs.rs/grep-searcher](https://docs.rs/grep-searcher) **NOTE:** You probably don't want to use this crate directly. Instead, you should prefer the facade defined in the [`grep`](https://docs.rs/grep) crate. ### Usage Add this to your `Cargo.toml`: ```toml [dependencies] grep-searcher = "0.1" ``` grep-searcher-0.1.8/UNLICENSE000064400000000000000000000022730072674642500136650ustar 00000000000000This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For more information, please refer to grep-searcher-0.1.8/examples/search-stdin.rs000064400000000000000000000013610072674642500171620ustar 00000000000000use std::env; use std::error::Error; use std::io; use std::process; use grep_regex::RegexMatcher; use grep_searcher::sinks::UTF8; use grep_searcher::Searcher; fn main() { if let Err(err) = example() { eprintln!("{}", err); process::exit(1); } } fn example() -> Result<(), Box> { let pattern = match env::args().nth(1) { Some(pattern) => pattern, None => { return Err(From::from(format!("Usage: search-stdin "))) } }; let matcher = RegexMatcher::new(&pattern)?; Searcher::new().search_reader( &matcher, io::stdin(), UTF8(|lnum, line| { print!("{}:{}", lnum, line); Ok(true) }), )?; Ok(()) } grep-searcher-0.1.8/src/lib.rs000064400000000000000000000075550072674642500143300ustar 00000000000000/*! This crate provides an implementation of line oriented search, with optional support for multi-line search. # Brief overview The principle type in this crate is a [`Searcher`](struct.Searcher.html), which can be configured and built by a [`SearcherBuilder`](struct.SearcherBuilder.html). A `Searcher` is responsible for reading bytes from a source (e.g., a file), executing a search of those bytes using a `Matcher` (e.g., a regex) and then reporting the results of that search to a [`Sink`](trait.Sink.html) (e.g., stdout). The `Searcher` itself is principally responsible for managing the consumption of bytes from a source and applying a `Matcher` over those bytes in an efficient way. The `Searcher` is also responsible for inverting a search, counting lines, reporting contextual lines, detecting binary data and even deciding whether or not to use memory maps. A `Matcher` (which is defined in the [`grep-matcher`](https://crates.io/crates/grep-matcher) crate) is a trait for describing the lowest levels of pattern search in a generic way. The interface itself is very similar to the interface of a regular expression. For example, the [`grep-regex`](https://crates.io/crates/grep-regex) crate provides an implementation of the `Matcher` trait using Rust's [`regex`](https://crates.io/crates/regex) crate. Finally, a `Sink` describes how callers receive search results producer by a `Searcher`. This includes routines that are called at the beginning and end of a search, in addition to routines that are called when matching or contextual lines are found by the `Searcher`. Implementations of `Sink` can be trivially simple, or extraordinarily complex, such as the `Standard` printer found in the [`grep-printer`](https://crates.io/crates/grep-printer) crate, which effectively implements grep-like output. This crate also provides convenience `Sink` implementations in the [`sinks`](sinks/index.html) sub-module for easy searching with closures. # Example This example shows how to execute the searcher and read the search results using the [`UTF8`](sinks/struct.UTF8.html) implementation of `Sink`. ``` use std::error::Error; use grep_matcher::Matcher; use grep_regex::RegexMatcher; use grep_searcher::Searcher; use grep_searcher::sinks::UTF8; const SHERLOCK: &'static [u8] = b"\ For the Doctor Watsons of this world, as opposed to the Sherlock Holmeses, success in the province of detective work must always be, to a very large extent, the result of luck. Sherlock Holmes can extract a clew from a wisp of straw or a flake of cigar ash; but Doctor Watson has to have it taken out for him and dusted, and exhibited clearly, with a label attached. "; # fn main() { example().unwrap() } fn example() -> Result<(), Box> { let matcher = RegexMatcher::new(r"Doctor \w+")?; let mut matches: Vec<(u64, String)> = vec![]; Searcher::new().search_slice(&matcher, SHERLOCK, UTF8(|lnum, line| { // We are guaranteed to find a match, so the unwrap is OK. let mymatch = matcher.find(line.as_bytes())?.unwrap(); matches.push((lnum, line[mymatch].to_string())); Ok(true) }))?; assert_eq!(matches.len(), 2); assert_eq!( matches[0], (1, "Doctor Watsons".to_string()) ); assert_eq!( matches[1], (5, "Doctor Watson".to_string()) ); Ok(()) } ``` See also `examples/search-stdin.rs` from the root of this crate's directory to see a similar example that accepts a pattern on the command line and searches stdin. */ #![deny(missing_docs)] pub use crate::lines::{LineIter, LineStep}; pub use crate::searcher::{ BinaryDetection, ConfigError, Encoding, MmapChoice, Searcher, SearcherBuilder, }; pub use crate::sink::sinks; pub use crate::sink::{ Sink, SinkContext, SinkContextKind, SinkError, SinkFinish, SinkMatch, }; #[macro_use] mod macros; mod line_buffer; mod lines; mod searcher; mod sink; #[cfg(test)] mod testutil; grep-searcher-0.1.8/src/line_buffer.rs000064400000000000000000001033310072674642500160270ustar 00000000000000use std::cmp; use std::io; use bstr::ByteSlice; /// The default buffer capacity that we use for the line buffer. pub(crate) const DEFAULT_BUFFER_CAPACITY: usize = 64 * (1 << 10); // 64 KB /// The behavior of a searcher in the face of long lines and big contexts. /// /// When searching data incrementally using a fixed size buffer, this controls /// the amount of *additional* memory to allocate beyond the size of the buffer /// to accommodate lines (which may include the lines in a context window, when /// enabled) that do not fit in the buffer. /// /// The default is to eagerly allocate without a limit. #[derive(Clone, Copy, Debug)] pub enum BufferAllocation { /// Attempt to expand the size of the buffer until either at least the next /// line fits into memory or until all available memory is exhausted. /// /// This is the default. Eager, /// Limit the amount of additional memory allocated to the given size. If /// a line is found that requires more memory than is allowed here, then /// stop reading and return an error. Error(usize), } impl Default for BufferAllocation { fn default() -> BufferAllocation { BufferAllocation::Eager } } /// Create a new error to be used when a configured allocation limit has been /// reached. pub fn alloc_error(limit: usize) -> io::Error { let msg = format!("configured allocation limit ({}) exceeded", limit); io::Error::new(io::ErrorKind::Other, msg) } /// The behavior of binary detection in the line buffer. /// /// Binary detection is the process of _heuristically_ identifying whether a /// given chunk of data is binary or not, and then taking an action based on /// the result of that heuristic. The motivation behind detecting binary data /// is that binary data often indicates data that is undesirable to search /// using textual patterns. Of course, there are many cases in which this isn't /// true, which is why binary detection is disabled by default. #[derive(Clone, Copy, Debug)] pub enum BinaryDetection { /// No binary detection is performed. Data reported by the line buffer may /// contain arbitrary bytes. None, /// The given byte is searched in all contents read by the line buffer. If /// it occurs, then the data is considered binary and the line buffer acts /// as if it reached EOF. The line buffer guarantees that this byte will /// never be observable by callers. Quit(u8), /// The given byte is searched in all contents read by the line buffer. If /// it occurs, then it is replaced by the line terminator. The line buffer /// guarantees that this byte will never be observable by callers. Convert(u8), } impl Default for BinaryDetection { fn default() -> BinaryDetection { BinaryDetection::None } } impl BinaryDetection { /// Returns true if and only if the detection heuristic demands that /// the line buffer stop read data once binary data is observed. fn is_quit(&self) -> bool { match *self { BinaryDetection::Quit(_) => true, _ => false, } } } /// The configuration of a buffer. This contains options that are fixed once /// a buffer has been constructed. #[derive(Clone, Copy, Debug)] struct Config { /// The number of bytes to attempt to read at a time. capacity: usize, /// The line terminator. lineterm: u8, /// The behavior for handling long lines. buffer_alloc: BufferAllocation, /// When set, the presence of the given byte indicates binary content. binary: BinaryDetection, } impl Default for Config { fn default() -> Config { Config { capacity: DEFAULT_BUFFER_CAPACITY, lineterm: b'\n', buffer_alloc: BufferAllocation::default(), binary: BinaryDetection::default(), } } } /// A builder for constructing line buffers. #[derive(Clone, Debug, Default)] pub struct LineBufferBuilder { config: Config, } impl LineBufferBuilder { /// Create a new builder for a buffer. pub fn new() -> LineBufferBuilder { LineBufferBuilder { config: Config::default() } } /// Create a new line buffer from this builder's configuration. pub fn build(&self) -> LineBuffer { LineBuffer { config: self.config, buf: vec![0; self.config.capacity], pos: 0, last_lineterm: 0, end: 0, absolute_byte_offset: 0, binary_byte_offset: None, } } /// Set the default capacity to use for a buffer. /// /// In general, the capacity of a buffer corresponds to the amount of data /// to hold in memory, and the size of the reads to make to the underlying /// reader. /// /// This is set to a reasonable default and probably shouldn't be changed /// unless there's a specific reason to do so. pub fn capacity(&mut self, capacity: usize) -> &mut LineBufferBuilder { self.config.capacity = capacity; self } /// Set the line terminator for the buffer. /// /// Every buffer has a line terminator, and this line terminator is used /// to determine how to roll the buffer forward. For example, when a read /// to the buffer's underlying reader occurs, the end of the data that is /// read is likely to correspond to an incomplete line. As a line buffer, /// callers should not access this data since it is incomplete. The line /// terminator is how the line buffer determines the part of the read that /// is incomplete. /// /// By default, this is set to `b'\n'`. pub fn line_terminator(&mut self, lineterm: u8) -> &mut LineBufferBuilder { self.config.lineterm = lineterm; self } /// Set the maximum amount of additional memory to allocate for long lines. /// /// In order to enable line oriented search, a fundamental requirement is /// that, at a minimum, each line must be able to fit into memory. This /// setting controls how big that line is allowed to be. By default, this /// is set to `BufferAllocation::Eager`, which means a line buffer will /// attempt to allocate as much memory as possible to fit a line, and will /// only be limited by available memory. /// /// Note that this setting only applies to the amount of *additional* /// memory to allocate, beyond the capacity of the buffer. That means that /// a value of `0` is sensible, and in particular, will guarantee that a /// line buffer will never allocate additional memory beyond its initial /// capacity. pub fn buffer_alloc( &mut self, behavior: BufferAllocation, ) -> &mut LineBufferBuilder { self.config.buffer_alloc = behavior; self } /// Whether to enable binary detection or not. Depending on the setting, /// this can either cause the line buffer to report EOF early or it can /// cause the line buffer to clean the data. /// /// By default, this is disabled. In general, binary detection should be /// viewed as an imperfect heuristic. pub fn binary_detection( &mut self, detection: BinaryDetection, ) -> &mut LineBufferBuilder { self.config.binary = detection; self } } /// A line buffer reader efficiently reads a line oriented buffer from an /// arbitrary reader. #[derive(Debug)] pub struct LineBufferReader<'b, R> { rdr: R, line_buffer: &'b mut LineBuffer, } impl<'b, R: io::Read> LineBufferReader<'b, R> { /// Create a new buffered reader that reads from `rdr` and uses the given /// `line_buffer` as an intermediate buffer. /// /// This does not change the binary detection behavior of the given line /// buffer. pub fn new( rdr: R, line_buffer: &'b mut LineBuffer, ) -> LineBufferReader<'b, R> { line_buffer.clear(); LineBufferReader { rdr, line_buffer } } /// The absolute byte offset which corresponds to the starting offsets /// of the data returned by `buffer` relative to the beginning of the /// underlying reader's contents. As such, this offset does not generally /// correspond to an offset in memory. It is typically used for reporting /// purposes. It can also be used for counting the number of bytes that /// have been searched. pub fn absolute_byte_offset(&self) -> u64 { self.line_buffer.absolute_byte_offset() } /// If binary data was detected, then this returns the absolute byte offset /// at which binary data was initially found. pub fn binary_byte_offset(&self) -> Option { self.line_buffer.binary_byte_offset() } /// Fill the contents of this buffer by discarding the part of the buffer /// that has been consumed. The free space created by discarding the /// consumed part of the buffer is then filled with new data from the /// reader. /// /// If EOF is reached, then `false` is returned. Otherwise, `true` is /// returned. (Note that if this line buffer's binary detection is set to /// `Quit`, then the presence of binary data will cause this buffer to /// behave as if it had seen EOF at the first occurrence of binary data.) /// /// This forwards any errors returned by the underlying reader, and will /// also return an error if the buffer must be expanded past its allocation /// limit, as governed by the buffer allocation strategy. pub fn fill(&mut self) -> Result { self.line_buffer.fill(&mut self.rdr) } /// Return the contents of this buffer. pub fn buffer(&self) -> &[u8] { self.line_buffer.buffer() } /// Return the buffer as a BStr, used for convenient equality checking /// in tests only. #[cfg(test)] fn bstr(&self) -> &::bstr::BStr { self.buffer().as_bstr() } /// Consume the number of bytes provided. This must be less than or equal /// to the number of bytes returned by `buffer`. pub fn consume(&mut self, amt: usize) { self.line_buffer.consume(amt); } /// Consumes the remainder of the buffer. Subsequent calls to `buffer` are /// guaranteed to return an empty slice until the buffer is refilled. /// /// This is a convenience function for `consume(buffer.len())`. #[cfg(test)] fn consume_all(&mut self) { self.line_buffer.consume_all(); } } /// A line buffer manages a (typically fixed) buffer for holding lines. /// /// Callers should create line buffers sparingly and reuse them when possible. /// Line buffers cannot be used directly, but instead must be used via the /// LineBufferReader. #[derive(Clone, Debug)] pub struct LineBuffer { /// The configuration of this buffer. config: Config, /// The primary buffer with which to hold data. buf: Vec, /// The current position of this buffer. This is always a valid sliceable /// index into `buf`, and its maximum value is the length of `buf`. pos: usize, /// The end position of searchable content in this buffer. This is either /// set to just after the final line terminator in the buffer, or to just /// after the end of the last byte emitted by the reader when the reader /// has been exhausted. last_lineterm: usize, /// The end position of the buffer. This is always greater than or equal to /// last_lineterm. The bytes between last_lineterm and end, if any, always /// correspond to a partial line. end: usize, /// The absolute byte offset corresponding to `pos`. This is most typically /// not a valid index into addressable memory, but rather, an offset that /// is relative to all data that passes through a line buffer (since /// construction or since the last time `clear` was called). /// /// When the line buffer reaches EOF, this is set to the position just /// after the last byte read from the underlying reader. That is, it /// becomes the total count of bytes that have been read. absolute_byte_offset: u64, /// If binary data was found, this records the absolute byte offset at /// which it was first detected. binary_byte_offset: Option, } impl LineBuffer { /// Set the binary detection method used on this line buffer. /// /// This permits dynamically changing the binary detection strategy on /// an existing line buffer without needing to create a new one. pub fn set_binary_detection(&mut self, binary: BinaryDetection) { self.config.binary = binary; } /// Reset this buffer, such that it can be used with a new reader. fn clear(&mut self) { self.pos = 0; self.last_lineterm = 0; self.end = 0; self.absolute_byte_offset = 0; self.binary_byte_offset = None; } /// The absolute byte offset which corresponds to the starting offsets /// of the data returned by `buffer` relative to the beginning of the /// reader's contents. As such, this offset does not generally correspond /// to an offset in memory. It is typically used for reporting purposes, /// particularly in error messages. /// /// This is reset to `0` when `clear` is called. fn absolute_byte_offset(&self) -> u64 { self.absolute_byte_offset } /// If binary data was detected, then this returns the absolute byte offset /// at which binary data was initially found. fn binary_byte_offset(&self) -> Option { self.binary_byte_offset } /// Return the contents of this buffer. fn buffer(&self) -> &[u8] { &self.buf[self.pos..self.last_lineterm] } /// Return the contents of the free space beyond the end of the buffer as /// a mutable slice. fn free_buffer(&mut self) -> &mut [u8] { &mut self.buf[self.end..] } /// Consume the number of bytes provided. This must be less than or equal /// to the number of bytes returned by `buffer`. fn consume(&mut self, amt: usize) { assert!(amt <= self.buffer().len()); self.pos += amt; self.absolute_byte_offset += amt as u64; } /// Consumes the remainder of the buffer. Subsequent calls to `buffer` are /// guaranteed to return an empty slice until the buffer is refilled. /// /// This is a convenience function for `consume(buffer.len())`. #[cfg(test)] fn consume_all(&mut self) { let amt = self.buffer().len(); self.consume(amt); } /// Fill the contents of this buffer by discarding the part of the buffer /// that has been consumed. The free space created by discarding the /// consumed part of the buffer is then filled with new data from the given /// reader. /// /// Callers should provide the same reader to this line buffer in /// subsequent calls to fill. A different reader can only be used /// immediately following a call to `clear`. /// /// If EOF is reached, then `false` is returned. Otherwise, `true` is /// returned. (Note that if this line buffer's binary detection is set to /// `Quit`, then the presence of binary data will cause this buffer to /// behave as if it had seen EOF.) /// /// This forwards any errors returned by `rdr`, and will also return an /// error if the buffer must be expanded past its allocation limit, as /// governed by the buffer allocation strategy. fn fill(&mut self, mut rdr: R) -> Result { // If the binary detection heuristic tells us to quit once binary data // has been observed, then we no longer read new data and reach EOF // once the current buffer has been consumed. if self.config.binary.is_quit() && self.binary_byte_offset.is_some() { return Ok(!self.buffer().is_empty()); } self.roll(); assert_eq!(self.pos, 0); loop { self.ensure_capacity()?; let readlen = rdr.read(self.free_buffer().as_bytes_mut())?; if readlen == 0 { // We're only done reading for good once the caller has // consumed everything. self.last_lineterm = self.end; return Ok(!self.buffer().is_empty()); } // Get a mutable view into the bytes we've just read. These are // the bytes that we do binary detection on, and also the bytes we // search to find the last line terminator. We need a mutable slice // in the case of binary conversion. let oldend = self.end; self.end += readlen; let newbytes = &mut self.buf[oldend..self.end]; // Binary detection. match self.config.binary { BinaryDetection::None => {} // nothing to do BinaryDetection::Quit(byte) => { if let Some(i) = newbytes.find_byte(byte) { self.end = oldend + i; self.last_lineterm = self.end; self.binary_byte_offset = Some(self.absolute_byte_offset + self.end as u64); // If the first byte in our buffer is a binary byte, // then our buffer is empty and we should report as // such to the caller. return Ok(self.pos < self.end); } } BinaryDetection::Convert(byte) => { if let Some(i) = replace_bytes(newbytes, byte, self.config.lineterm) { // Record only the first binary offset. if self.binary_byte_offset.is_none() { self.binary_byte_offset = Some( self.absolute_byte_offset + (oldend + i) as u64, ); } } } } // Update our `last_lineterm` positions if we read one. if let Some(i) = newbytes.rfind_byte(self.config.lineterm) { self.last_lineterm = oldend + i + 1; return Ok(true); } // At this point, if we couldn't find a line terminator, then we // don't have a complete line. Therefore, we try to read more! } } /// Roll the unconsumed parts of the buffer to the front. /// /// This operation is idempotent. /// /// After rolling, `last_lineterm` and `end` point to the same location, /// and `pos` is always set to `0`. fn roll(&mut self) { if self.pos == self.end { self.pos = 0; self.last_lineterm = 0; self.end = 0; return; } let roll_len = self.end - self.pos; self.buf.copy_within_str(self.pos..self.end, 0); self.pos = 0; self.last_lineterm = roll_len; self.end = roll_len; } /// Ensures that the internal buffer has a non-zero amount of free space /// in which to read more data. If there is no free space, then more is /// allocated. If the allocation must exceed the configured limit, then /// this returns an error. fn ensure_capacity(&mut self) -> Result<(), io::Error> { if !self.free_buffer().is_empty() { return Ok(()); } // `len` is used for computing the next allocation size. The capacity // is permitted to start at `0`, so we make sure it's at least `1`. let len = cmp::max(1, self.buf.len()); let additional = match self.config.buffer_alloc { BufferAllocation::Eager => len * 2, BufferAllocation::Error(limit) => { let used = self.buf.len() - self.config.capacity; let n = cmp::min(len * 2, limit - used); if n == 0 { return Err(alloc_error(self.config.capacity + limit)); } n } }; assert!(additional > 0); let newlen = self.buf.len() + additional; self.buf.resize(newlen, 0); assert!(!self.free_buffer().is_empty()); Ok(()) } } /// Replaces `src` with `replacement` in bytes, and return the offset of the /// first replacement, if one exists. fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option { if src == replacement { return None; } let mut first_pos = None; let mut pos = 0; while let Some(i) = bytes[pos..].find_byte(src).map(|i| pos + i) { if first_pos.is_none() { first_pos = Some(i); } bytes[i] = replacement; pos = i + 1; while bytes.get(pos) == Some(&src) { bytes[pos] = replacement; pos += 1; } } first_pos } #[cfg(test)] mod tests { use super::*; use bstr::{ByteSlice, ByteVec}; use std::str; const SHERLOCK: &'static str = "\ For the Doctor Watsons of this world, as opposed to the Sherlock Holmeses, success in the province of detective work must always be, to a very large extent, the result of luck. Sherlock Holmes can extract a clew from a wisp of straw or a flake of cigar ash; but Doctor Watson has to have it taken out for him and dusted, and exhibited clearly, with a label attached.\ "; fn s(slice: &str) -> String { slice.to_string() } fn replace_str( slice: &str, src: u8, replacement: u8, ) -> (String, Option) { let mut dst = Vec::from(slice); let result = replace_bytes(&mut dst, src, replacement); (dst.into_string().unwrap(), result) } #[test] fn replace() { assert_eq!(replace_str("abc", b'b', b'z'), (s("azc"), Some(1))); assert_eq!(replace_str("abb", b'b', b'z'), (s("azz"), Some(1))); assert_eq!(replace_str("aba", b'a', b'z'), (s("zbz"), Some(0))); assert_eq!(replace_str("bbb", b'b', b'z'), (s("zzz"), Some(0))); assert_eq!(replace_str("bac", b'b', b'z'), (s("zac"), Some(0))); } #[test] fn buffer_basics1() { let bytes = "homer\nlisa\nmaggie"; let mut linebuf = LineBufferBuilder::new().build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\nlisa\n"); assert_eq!(rdr.absolute_byte_offset(), 0); rdr.consume(5); assert_eq!(rdr.absolute_byte_offset(), 5); rdr.consume_all(); assert_eq!(rdr.absolute_byte_offset(), 11); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "maggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } #[test] fn buffer_basics2() { let bytes = "homer\nlisa\nmaggie\n"; let mut linebuf = LineBufferBuilder::new().build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } #[test] fn buffer_basics3() { let bytes = "\n"; let mut linebuf = LineBufferBuilder::new().build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } #[test] fn buffer_basics4() { let bytes = "\n\n"; let mut linebuf = LineBufferBuilder::new().build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "\n\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } #[test] fn buffer_empty() { let bytes = ""; let mut linebuf = LineBufferBuilder::new().build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } #[test] fn buffer_zero_capacity() { let bytes = "homer\nlisa\nmaggie"; let mut linebuf = LineBufferBuilder::new().capacity(0).build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); while rdr.fill().unwrap() { rdr.consume_all(); } assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } #[test] fn buffer_small_capacity() { let bytes = "homer\nlisa\nmaggie"; let mut linebuf = LineBufferBuilder::new().capacity(1).build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); let mut got = vec![]; while rdr.fill().unwrap() { got.push_str(rdr.buffer()); rdr.consume_all(); } assert_eq!(bytes, got.as_bstr()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } #[test] fn buffer_limited_capacity1() { let bytes = "homer\nlisa\nmaggie"; let mut linebuf = LineBufferBuilder::new() .capacity(1) .buffer_alloc(BufferAllocation::Error(5)) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\n"); rdr.consume_all(); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "lisa\n"); rdr.consume_all(); // This returns an error because while we have just enough room to // store maggie in the buffer, we *don't* have enough room to read one // more byte, so we don't know whether we're at EOF or not, and // therefore must give up. assert!(rdr.fill().is_err()); // We can mush on though! assert_eq!(rdr.bstr(), "m"); rdr.consume_all(); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "aggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); } #[test] fn buffer_limited_capacity2() { let bytes = "homer\nlisa\nmaggie"; let mut linebuf = LineBufferBuilder::new() .capacity(1) .buffer_alloc(BufferAllocation::Error(6)) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\n"); rdr.consume_all(); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "lisa\n"); rdr.consume_all(); // We have just enough space. assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "maggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); } #[test] fn buffer_limited_capacity3() { let bytes = "homer\nlisa\nmaggie"; let mut linebuf = LineBufferBuilder::new() .capacity(1) .buffer_alloc(BufferAllocation::Error(0)) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.fill().is_err()); assert_eq!(rdr.bstr(), ""); } #[test] fn buffer_binary_none() { let bytes = "homer\nli\x00sa\nmaggie\n"; let mut linebuf = LineBufferBuilder::new().build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\nli\x00sa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), None); } #[test] fn buffer_binary_quit1() { let bytes = "homer\nli\x00sa\nmaggie\n"; let mut linebuf = LineBufferBuilder::new() .binary_detection(BinaryDetection::Quit(b'\x00')) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\nli"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), 8); assert_eq!(rdr.binary_byte_offset(), Some(8)); } #[test] fn buffer_binary_quit2() { let bytes = "\x00homer\nlisa\nmaggie\n"; let mut linebuf = LineBufferBuilder::new() .binary_detection(BinaryDetection::Quit(b'\x00')) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.bstr(), ""); assert_eq!(rdr.absolute_byte_offset(), 0); assert_eq!(rdr.binary_byte_offset(), Some(0)); } #[test] fn buffer_binary_quit3() { let bytes = "homer\nlisa\nmaggie\n\x00"; let mut linebuf = LineBufferBuilder::new() .binary_detection(BinaryDetection::Quit(b'\x00')) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64 - 1); assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 1)); } #[test] fn buffer_binary_quit4() { let bytes = "homer\nlisa\nmaggie\x00\n"; let mut linebuf = LineBufferBuilder::new() .binary_detection(BinaryDetection::Quit(b'\x00')) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64 - 2); assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 2)); } #[test] fn buffer_binary_quit5() { let mut linebuf = LineBufferBuilder::new() .binary_detection(BinaryDetection::Quit(b'u')) .build(); let mut rdr = LineBufferReader::new(SHERLOCK.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!( rdr.bstr(), "\ For the Doctor Watsons of this world, as opposed to the Sherlock Holmeses, s\ " ); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), 76); assert_eq!(rdr.binary_byte_offset(), Some(76)); assert_eq!(SHERLOCK.as_bytes()[76], b'u'); } #[test] fn buffer_binary_convert1() { let bytes = "homer\nli\x00sa\nmaggie\n"; let mut linebuf = LineBufferBuilder::new() .binary_detection(BinaryDetection::Convert(b'\x00')) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\nli\nsa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), Some(8)); } #[test] fn buffer_binary_convert2() { let bytes = "\x00homer\nlisa\nmaggie\n"; let mut linebuf = LineBufferBuilder::new() .binary_detection(BinaryDetection::Convert(b'\x00')) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "\nhomer\nlisa\nmaggie\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), Some(0)); } #[test] fn buffer_binary_convert3() { let bytes = "homer\nlisa\nmaggie\n\x00"; let mut linebuf = LineBufferBuilder::new() .binary_detection(BinaryDetection::Convert(b'\x00')) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 1)); } #[test] fn buffer_binary_convert4() { let bytes = "homer\nlisa\nmaggie\x00\n"; let mut linebuf = LineBufferBuilder::new() .binary_detection(BinaryDetection::Convert(b'\x00')) .build(); let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); assert!(rdr.buffer().is_empty()); assert!(rdr.fill().unwrap()); assert_eq!(rdr.bstr(), "homer\nlisa\nmaggie\n\n"); rdr.consume_all(); assert!(!rdr.fill().unwrap()); assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 2)); } } grep-searcher-0.1.8/src/lines.rs000064400000000000000000000401670072674642500146700ustar 00000000000000/*! A collection of routines for performing operations on lines. */ use bstr::ByteSlice; use bytecount; use grep_matcher::{LineTerminator, Match}; /// An iterator over lines in a particular slice of bytes. /// /// Line terminators are considered part of the line they terminate. All lines /// yielded by the iterator are guaranteed to be non-empty. /// /// `'b` refers to the lifetime of the underlying bytes. #[derive(Debug)] pub struct LineIter<'b> { bytes: &'b [u8], stepper: LineStep, } impl<'b> LineIter<'b> { /// Create a new line iterator that yields lines in the given bytes that /// are terminated by `line_term`. pub fn new(line_term: u8, bytes: &'b [u8]) -> LineIter<'b> { LineIter { bytes: bytes, stepper: LineStep::new(line_term, 0, bytes.len()), } } } impl<'b> Iterator for LineIter<'b> { type Item = &'b [u8]; fn next(&mut self) -> Option<&'b [u8]> { self.stepper.next_match(self.bytes).map(|m| &self.bytes[m]) } } /// An explicit iterator over lines in a particular slice of bytes. /// /// This iterator avoids borrowing the bytes themselves, and instead requires /// callers to explicitly provide the bytes when moving through the iterator. /// While not idiomatic, this provides a simple way of iterating over lines /// that doesn't require borrowing the slice itself, which can be convenient. /// /// Line terminators are considered part of the line they terminate. All lines /// yielded by the iterator are guaranteed to be non-empty. #[derive(Debug)] pub struct LineStep { line_term: u8, pos: usize, end: usize, } impl LineStep { /// Create a new line iterator over the given range of bytes using the /// given line terminator. /// /// Callers should provide the actual bytes for each call to `next`. The /// same slice must be provided to each call. /// /// This panics if `start` is not less than or equal to `end`. pub fn new(line_term: u8, start: usize, end: usize) -> LineStep { LineStep { line_term, pos: start, end: end } } /// Return the start and end position of the next line in the given bytes. /// /// The caller must past exactly the same slice of bytes for each call to /// `next`. /// /// The range returned includes the line terminator. Ranges are always /// non-empty. pub fn next(&mut self, bytes: &[u8]) -> Option<(usize, usize)> { self.next_impl(bytes) } /// Like next, but returns a `Match` instead of a tuple. #[inline(always)] pub(crate) fn next_match(&mut self, bytes: &[u8]) -> Option { self.next_impl(bytes).map(|(s, e)| Match::new(s, e)) } #[inline(always)] fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> { bytes = &bytes[..self.end]; match bytes[self.pos..].find_byte(self.line_term) { None => { if self.pos < bytes.len() { let m = (self.pos, bytes.len()); assert!(m.0 <= m.1); self.pos = m.1; Some(m) } else { None } } Some(line_end) => { let m = (self.pos, self.pos + line_end + 1); assert!(m.0 <= m.1); self.pos = m.1; Some(m) } } } } /// Count the number of occurrences of `line_term` in `bytes`. pub fn count(bytes: &[u8], line_term: u8) -> u64 { bytecount::count(bytes, line_term) as u64 } /// Given a line that possibly ends with a terminator, return that line without /// the terminator. #[inline(always)] pub fn without_terminator(bytes: &[u8], line_term: LineTerminator) -> &[u8] { let line_term = line_term.as_bytes(); let start = bytes.len().saturating_sub(line_term.len()); if bytes.get(start..) == Some(line_term) { return &bytes[..bytes.len() - line_term.len()]; } bytes } /// Return the start and end offsets of the lines containing the given range /// of bytes. /// /// Line terminators are considered part of the line they terminate. #[inline(always)] pub fn locate(bytes: &[u8], line_term: u8, range: Match) -> Match { let line_start = bytes[..range.start()].rfind_byte(line_term).map_or(0, |i| i + 1); let line_end = if range.end() > line_start && bytes[range.end() - 1] == line_term { range.end() } else { bytes[range.end()..] .find_byte(line_term) .map_or(bytes.len(), |i| range.end() + i + 1) }; Match::new(line_start, line_end) } /// Returns the minimal starting offset of the line that occurs `count` lines /// before the last line in `bytes`. /// /// Lines are terminated by `line_term`. If `count` is zero, then this returns /// the starting offset of the last line in `bytes`. /// /// If `bytes` ends with a line terminator, then the terminator itself is /// considered part of the last line. pub fn preceding(bytes: &[u8], line_term: u8, count: usize) -> usize { preceding_by_pos(bytes, bytes.len(), line_term, count) } /// Returns the minimal starting offset of the line that occurs `count` lines /// before the line containing `pos`. Lines are terminated by `line_term`. /// If `count` is zero, then this returns the starting offset of the line /// containing `pos`. /// /// If `pos` points just past a line terminator, then it is considered part of /// the line that it terminates. For example, given `bytes = b"abc\nxyz\n"` /// and `pos = 7`, `preceding(bytes, pos, b'\n', 0)` returns `4` (as does `pos /// = 8`) and `preceding(bytes, pos, `b'\n', 1)` returns `0`. fn preceding_by_pos( bytes: &[u8], mut pos: usize, line_term: u8, mut count: usize, ) -> usize { if pos == 0 { return 0; } else if bytes[pos - 1] == line_term { pos -= 1; } loop { match bytes[..pos].rfind_byte(line_term) { None => { return 0; } Some(i) => { if count == 0 { return i + 1; } else if i == 0 { return 0; } count -= 1; pos = i; } } } } #[cfg(test)] mod tests { use super::*; use grep_matcher::Match; use std::ops::Range; use std::str; const SHERLOCK: &'static str = "\ For the Doctor Watsons of this world, as opposed to the Sherlock Holmeses, success in the province of detective work must always be, to a very large extent, the result of luck. Sherlock Holmes can extract a clew from a wisp of straw or a flake of cigar ash; but Doctor Watson has to have it taken out for him and dusted, and exhibited clearly, with a label attached.\ "; fn m(start: usize, end: usize) -> Match { Match::new(start, end) } fn lines(text: &str) -> Vec<&str> { let mut results = vec![]; let mut it = LineStep::new(b'\n', 0, text.len()); while let Some(m) = it.next_match(text.as_bytes()) { results.push(&text[m]); } results } fn line_ranges(text: &str) -> Vec> { let mut results = vec![]; let mut it = LineStep::new(b'\n', 0, text.len()); while let Some(m) = it.next_match(text.as_bytes()) { results.push(m.start()..m.end()); } results } fn prev(text: &str, pos: usize, count: usize) -> usize { preceding_by_pos(text.as_bytes(), pos, b'\n', count) } fn loc(text: &str, start: usize, end: usize) -> Match { locate(text.as_bytes(), b'\n', Match::new(start, end)) } #[test] fn line_count() { assert_eq!(0, count(b"", b'\n')); assert_eq!(1, count(b"\n", b'\n')); assert_eq!(2, count(b"\n\n", b'\n')); assert_eq!(2, count(b"a\nb\nc", b'\n')); } #[test] fn line_locate() { let t = SHERLOCK; let lines = line_ranges(t); assert_eq!( loc(t, lines[0].start, lines[0].end), m(lines[0].start, lines[0].end) ); assert_eq!( loc(t, lines[0].start + 1, lines[0].end), m(lines[0].start, lines[0].end) ); assert_eq!( loc(t, lines[0].end - 1, lines[0].end), m(lines[0].start, lines[0].end) ); assert_eq!( loc(t, lines[0].end, lines[0].end), m(lines[1].start, lines[1].end) ); assert_eq!( loc(t, lines[5].start, lines[5].end), m(lines[5].start, lines[5].end) ); assert_eq!( loc(t, lines[5].start + 1, lines[5].end), m(lines[5].start, lines[5].end) ); assert_eq!( loc(t, lines[5].end - 1, lines[5].end), m(lines[5].start, lines[5].end) ); assert_eq!( loc(t, lines[5].end, lines[5].end), m(lines[5].start, lines[5].end) ); } #[test] fn line_locate_weird() { assert_eq!(loc("", 0, 0), m(0, 0)); assert_eq!(loc("\n", 0, 1), m(0, 1)); assert_eq!(loc("\n", 1, 1), m(1, 1)); assert_eq!(loc("\n\n", 0, 0), m(0, 1)); assert_eq!(loc("\n\n", 0, 1), m(0, 1)); assert_eq!(loc("\n\n", 1, 1), m(1, 2)); assert_eq!(loc("\n\n", 1, 2), m(1, 2)); assert_eq!(loc("\n\n", 2, 2), m(2, 2)); assert_eq!(loc("a\nb\nc", 0, 1), m(0, 2)); assert_eq!(loc("a\nb\nc", 1, 2), m(0, 2)); assert_eq!(loc("a\nb\nc", 2, 3), m(2, 4)); assert_eq!(loc("a\nb\nc", 3, 4), m(2, 4)); assert_eq!(loc("a\nb\nc", 4, 5), m(4, 5)); assert_eq!(loc("a\nb\nc", 5, 5), m(4, 5)); } #[test] fn line_iter() { assert_eq!(lines("abc"), vec!["abc"]); assert_eq!(lines("abc\n"), vec!["abc\n"]); assert_eq!(lines("abc\nxyz"), vec!["abc\n", "xyz"]); assert_eq!(lines("abc\nxyz\n"), vec!["abc\n", "xyz\n"]); assert_eq!(lines("abc\n\n"), vec!["abc\n", "\n"]); assert_eq!(lines("abc\n\n\n"), vec!["abc\n", "\n", "\n"]); assert_eq!(lines("abc\n\nxyz"), vec!["abc\n", "\n", "xyz"]); assert_eq!(lines("abc\n\nxyz\n"), vec!["abc\n", "\n", "xyz\n"]); assert_eq!(lines("abc\nxyz\n\n"), vec!["abc\n", "xyz\n", "\n"]); assert_eq!(lines("\n"), vec!["\n"]); assert_eq!(lines(""), Vec::<&str>::new()); } #[test] fn line_iter_empty() { let mut it = LineStep::new(b'\n', 0, 0); assert_eq!(it.next(b"abc"), None); } #[test] fn preceding_lines_doc() { // These are the examples mentions in the documentation of `preceding`. let bytes = b"abc\nxyz\n"; assert_eq!(4, preceding_by_pos(bytes, 7, b'\n', 0)); assert_eq!(4, preceding_by_pos(bytes, 8, b'\n', 0)); assert_eq!(0, preceding_by_pos(bytes, 7, b'\n', 1)); assert_eq!(0, preceding_by_pos(bytes, 8, b'\n', 1)); } #[test] fn preceding_lines_sherlock() { let t = SHERLOCK; let lines = line_ranges(t); // The following tests check the count == 0 case, i.e., finding the // beginning of the line containing the given position. assert_eq!(0, prev(t, 0, 0)); assert_eq!(0, prev(t, 1, 0)); // The line terminator is addressed by `end-1` and terminates the line // it is part of. assert_eq!(0, prev(t, lines[0].end - 1, 0)); assert_eq!(lines[0].start, prev(t, lines[0].end, 0)); // The end position of line addresses the byte immediately following a // line terminator, which puts it on the following line. assert_eq!(lines[1].start, prev(t, lines[0].end + 1, 0)); // Now tests for count > 0. assert_eq!(0, prev(t, 0, 1)); assert_eq!(0, prev(t, 0, 2)); assert_eq!(0, prev(t, 1, 1)); assert_eq!(0, prev(t, 1, 2)); assert_eq!(0, prev(t, lines[0].end - 1, 1)); assert_eq!(0, prev(t, lines[0].end - 1, 2)); assert_eq!(0, prev(t, lines[0].end, 1)); assert_eq!(0, prev(t, lines[0].end, 2)); assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1)); assert_eq!(lines[3].start, prev(t, lines[4].end, 1)); assert_eq!(lines[4].start, prev(t, lines[4].end + 1, 1)); // The last line has no line terminator. assert_eq!(lines[5].start, prev(t, lines[5].end, 0)); assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0)); assert_eq!(lines[4].start, prev(t, lines[5].end, 1)); assert_eq!(lines[0].start, prev(t, lines[5].end, 5)); } #[test] fn preceding_lines_short() { let t = "a\nb\nc\nd\ne\nf\n"; let lines = line_ranges(t); assert_eq!(12, t.len()); assert_eq!(lines[5].start, prev(t, lines[5].end, 0)); assert_eq!(lines[4].start, prev(t, lines[5].end, 1)); assert_eq!(lines[3].start, prev(t, lines[5].end, 2)); assert_eq!(lines[2].start, prev(t, lines[5].end, 3)); assert_eq!(lines[1].start, prev(t, lines[5].end, 4)); assert_eq!(lines[0].start, prev(t, lines[5].end, 5)); assert_eq!(lines[0].start, prev(t, lines[5].end, 6)); assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0)); assert_eq!(lines[4].start, prev(t, lines[5].end - 1, 1)); assert_eq!(lines[3].start, prev(t, lines[5].end - 1, 2)); assert_eq!(lines[2].start, prev(t, lines[5].end - 1, 3)); assert_eq!(lines[1].start, prev(t, lines[5].end - 1, 4)); assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 5)); assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 6)); assert_eq!(lines[4].start, prev(t, lines[5].start, 0)); assert_eq!(lines[3].start, prev(t, lines[5].start, 1)); assert_eq!(lines[2].start, prev(t, lines[5].start, 2)); assert_eq!(lines[1].start, prev(t, lines[5].start, 3)); assert_eq!(lines[0].start, prev(t, lines[5].start, 4)); assert_eq!(lines[0].start, prev(t, lines[5].start, 5)); assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1)); assert_eq!(lines[2].start, prev(t, lines[4].start, 1)); assert_eq!(lines[2].start, prev(t, lines[3].end - 1, 1)); assert_eq!(lines[1].start, prev(t, lines[3].start, 1)); assert_eq!(lines[1].start, prev(t, lines[2].end - 1, 1)); assert_eq!(lines[0].start, prev(t, lines[2].start, 1)); assert_eq!(lines[0].start, prev(t, lines[1].end - 1, 1)); assert_eq!(lines[0].start, prev(t, lines[1].start, 1)); assert_eq!(lines[0].start, prev(t, lines[0].end - 1, 1)); assert_eq!(lines[0].start, prev(t, lines[0].start, 1)); } #[test] fn preceding_lines_empty1() { let t = "\n\n\nd\ne\nf\n"; let lines = line_ranges(t); assert_eq!(9, t.len()); assert_eq!(lines[0].start, prev(t, lines[0].end, 0)); assert_eq!(lines[0].start, prev(t, lines[0].end, 1)); assert_eq!(lines[1].start, prev(t, lines[1].end, 0)); assert_eq!(lines[0].start, prev(t, lines[1].end, 1)); assert_eq!(lines[5].start, prev(t, lines[5].end, 0)); assert_eq!(lines[4].start, prev(t, lines[5].end, 1)); assert_eq!(lines[3].start, prev(t, lines[5].end, 2)); assert_eq!(lines[2].start, prev(t, lines[5].end, 3)); assert_eq!(lines[1].start, prev(t, lines[5].end, 4)); assert_eq!(lines[0].start, prev(t, lines[5].end, 5)); assert_eq!(lines[0].start, prev(t, lines[5].end, 6)); } #[test] fn preceding_lines_empty2() { let t = "a\n\n\nd\ne\nf\n"; let lines = line_ranges(t); assert_eq!(10, t.len()); assert_eq!(lines[0].start, prev(t, lines[0].end, 0)); assert_eq!(lines[0].start, prev(t, lines[0].end, 1)); assert_eq!(lines[1].start, prev(t, lines[1].end, 0)); assert_eq!(lines[0].start, prev(t, lines[1].end, 1)); assert_eq!(lines[5].start, prev(t, lines[5].end, 0)); assert_eq!(lines[4].start, prev(t, lines[5].end, 1)); assert_eq!(lines[3].start, prev(t, lines[5].end, 2)); assert_eq!(lines[2].start, prev(t, lines[5].end, 3)); assert_eq!(lines[1].start, prev(t, lines[5].end, 4)); assert_eq!(lines[0].start, prev(t, lines[5].end, 5)); assert_eq!(lines[0].start, prev(t, lines[5].end, 6)); } } grep-searcher-0.1.8/src/macros.rs000064400000000000000000000013430072674642500150330ustar 00000000000000/// Like assert_eq, but nicer output for long strings. #[cfg(test)] #[macro_export] macro_rules! assert_eq_printed { ($expected:expr, $got:expr, $($tt:tt)*) => { let expected = &*$expected; let got = &*$got; let label = format!($($tt)*); if expected != got { panic!(" printed outputs differ! (label: {}) expected: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ {} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ got: ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ {} ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ", label, expected, got); } } } grep-searcher-0.1.8/src/searcher/core.rs000064400000000000000000000452330072674642500163010ustar 00000000000000use std::cmp; use bstr::ByteSlice; use crate::line_buffer::BinaryDetection; use crate::lines::{self, LineStep}; use crate::searcher::{Config, Range, Searcher}; use crate::sink::{ Sink, SinkContext, SinkContextKind, SinkError, SinkFinish, SinkMatch, }; use grep_matcher::{LineMatchKind, Matcher}; #[derive(Debug)] pub struct Core<'s, M: 's, S> { config: &'s Config, matcher: M, searcher: &'s Searcher, sink: S, binary: bool, pos: usize, absolute_byte_offset: u64, binary_byte_offset: Option, line_number: Option, last_line_counted: usize, last_line_visited: usize, after_context_left: usize, has_sunk: bool, } impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { pub fn new( searcher: &'s Searcher, matcher: M, sink: S, binary: bool, ) -> Core<'s, M, S> { let line_number = if searcher.config.line_number { Some(1) } else { None }; let core = Core { config: &searcher.config, matcher: matcher, searcher: searcher, sink: sink, binary: binary, pos: 0, absolute_byte_offset: 0, binary_byte_offset: None, line_number: line_number, last_line_counted: 0, last_line_visited: 0, after_context_left: 0, has_sunk: false, }; if !core.searcher.multi_line_with_matcher(&core.matcher) { if core.is_line_by_line_fast() { log::trace!("searcher core: will use fast line searcher"); } else { log::trace!("searcher core: will use slow line searcher"); } } core } pub fn pos(&self) -> usize { self.pos } pub fn set_pos(&mut self, pos: usize) { self.pos = pos; } pub fn binary_byte_offset(&self) -> Option { self.binary_byte_offset.map(|offset| offset as u64) } pub fn matcher(&self) -> &M { &self.matcher } pub fn matched( &mut self, buf: &[u8], range: &Range, ) -> Result { self.sink_matched(buf, range) } pub fn binary_data( &mut self, binary_byte_offset: u64, ) -> Result { self.sink.binary_data(&self.searcher, binary_byte_offset) } pub fn begin(&mut self) -> Result { self.sink.begin(&self.searcher) } pub fn finish( &mut self, byte_count: u64, binary_byte_offset: Option, ) -> Result<(), S::Error> { self.sink.finish( &self.searcher, &SinkFinish { byte_count, binary_byte_offset }, ) } pub fn match_by_line(&mut self, buf: &[u8]) -> Result { if self.is_line_by_line_fast() { self.match_by_line_fast(buf) } else { self.match_by_line_slow(buf) } } pub fn roll(&mut self, buf: &[u8]) -> usize { let consumed = if self.config.max_context() == 0 { buf.len() } else { // It might seem like all we need to care about here is just // the "before context," but in order to sink the context // separator (when before_context==0 and after_context>0), we // need to know something about the position of the previous // line visited, even if we're at the beginning of the buffer. let context_start = lines::preceding( buf, self.config.line_term.as_byte(), self.config.max_context(), ); let consumed = cmp::max(context_start, self.last_line_visited); consumed }; self.count_lines(buf, consumed); self.absolute_byte_offset += consumed as u64; self.last_line_counted = 0; self.last_line_visited = 0; self.set_pos(buf.len() - consumed); consumed } pub fn detect_binary( &mut self, buf: &[u8], range: &Range, ) -> Result { if self.binary_byte_offset.is_some() { return Ok(self.config.binary.quit_byte().is_some()); } let binary_byte = match self.config.binary.0 { BinaryDetection::Quit(b) => b, BinaryDetection::Convert(b) => b, _ => return Ok(false), }; if let Some(i) = buf[*range].find_byte(binary_byte) { let offset = range.start() + i; self.binary_byte_offset = Some(offset); if !self.binary_data(offset as u64)? { return Ok(true); } Ok(self.config.binary.quit_byte().is_some()) } else { Ok(false) } } pub fn before_context_by_line( &mut self, buf: &[u8], upto: usize, ) -> Result { if self.config.before_context == 0 { return Ok(true); } let range = Range::new(self.last_line_visited, upto); if range.is_empty() { return Ok(true); } let before_context_start = range.start() + lines::preceding( &buf[range], self.config.line_term.as_byte(), self.config.before_context - 1, ); let range = Range::new(before_context_start, range.end()); let mut stepper = LineStep::new( self.config.line_term.as_byte(), range.start(), range.end(), ); while let Some(line) = stepper.next_match(buf) { if !self.sink_break_context(line.start())? { return Ok(false); } if !self.sink_before_context(buf, &line)? { return Ok(false); } } Ok(true) } pub fn after_context_by_line( &mut self, buf: &[u8], upto: usize, ) -> Result { if self.after_context_left == 0 { return Ok(true); } let range = Range::new(self.last_line_visited, upto); let mut stepper = LineStep::new( self.config.line_term.as_byte(), range.start(), range.end(), ); while let Some(line) = stepper.next_match(buf) { if !self.sink_after_context(buf, &line)? { return Ok(false); } if self.after_context_left == 0 { break; } } Ok(true) } pub fn other_context_by_line( &mut self, buf: &[u8], upto: usize, ) -> Result { let range = Range::new(self.last_line_visited, upto); let mut stepper = LineStep::new( self.config.line_term.as_byte(), range.start(), range.end(), ); while let Some(line) = stepper.next_match(buf) { if !self.sink_other_context(buf, &line)? { return Ok(false); } } Ok(true) } fn match_by_line_slow(&mut self, buf: &[u8]) -> Result { debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher)); let range = Range::new(self.pos(), buf.len()); let mut stepper = LineStep::new( self.config.line_term.as_byte(), range.start(), range.end(), ); while let Some(line) = stepper.next_match(buf) { let matched = { // Stripping the line terminator is necessary to prevent some // classes of regexes from matching the empty position *after* // the end of the line. For example, `(?m)^$` will match at // position (2, 2) in the string `a\n`. let slice = lines::without_terminator( &buf[line], self.config.line_term, ); match self.matcher.shortest_match(slice) { Err(err) => return Err(S::Error::error_message(err)), Ok(result) => result.is_some(), } }; self.set_pos(line.end()); if matched != self.config.invert_match { if !self.before_context_by_line(buf, line.start())? { return Ok(false); } if !self.sink_matched(buf, &line)? { return Ok(false); } } else if self.after_context_left >= 1 { if !self.sink_after_context(buf, &line)? { return Ok(false); } } else if self.config.passthru { if !self.sink_other_context(buf, &line)? { return Ok(false); } } } Ok(true) } fn match_by_line_fast(&mut self, buf: &[u8]) -> Result { debug_assert!(!self.config.passthru); while !buf[self.pos()..].is_empty() { if self.config.invert_match { if !self.match_by_line_fast_invert(buf)? { return Ok(false); } } else if let Some(line) = self.find_by_line_fast(buf)? { if self.config.max_context() > 0 { if !self.after_context_by_line(buf, line.start())? { return Ok(false); } if !self.before_context_by_line(buf, line.start())? { return Ok(false); } } self.set_pos(line.end()); if !self.sink_matched(buf, &line)? { return Ok(false); } } else { break; } } if !self.after_context_by_line(buf, buf.len())? { return Ok(false); } self.set_pos(buf.len()); Ok(true) } #[inline(always)] fn match_by_line_fast_invert( &mut self, buf: &[u8], ) -> Result { assert!(self.config.invert_match); let invert_match = match self.find_by_line_fast(buf)? { None => { let range = Range::new(self.pos(), buf.len()); self.set_pos(range.end()); range } Some(line) => { let range = Range::new(self.pos(), line.start()); self.set_pos(line.end()); range } }; if invert_match.is_empty() { return Ok(true); } if !self.after_context_by_line(buf, invert_match.start())? { return Ok(false); } if !self.before_context_by_line(buf, invert_match.start())? { return Ok(false); } let mut stepper = LineStep::new( self.config.line_term.as_byte(), invert_match.start(), invert_match.end(), ); while let Some(line) = stepper.next_match(buf) { if !self.sink_matched(buf, &line)? { return Ok(false); } } Ok(true) } #[inline(always)] fn find_by_line_fast( &self, buf: &[u8], ) -> Result, S::Error> { debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher)); debug_assert!(self.is_line_by_line_fast()); let mut pos = self.pos(); while !buf[pos..].is_empty() { match self.matcher.find_candidate_line(&buf[pos..]) { Err(err) => return Err(S::Error::error_message(err)), Ok(None) => return Ok(None), Ok(Some(LineMatchKind::Confirmed(i))) => { let line = lines::locate( buf, self.config.line_term.as_byte(), Range::zero(i).offset(pos), ); // If we matched beyond the end of the buffer, then we // don't report this as a match. if line.start() == buf.len() { pos = buf.len(); continue; } return Ok(Some(line)); } Ok(Some(LineMatchKind::Candidate(i))) => { let line = lines::locate( buf, self.config.line_term.as_byte(), Range::zero(i).offset(pos), ); // We need to strip the line terminator here to match the // semantics of line-by-line searching. Namely, regexes // like `(?m)^$` can match at the final position beyond a // line terminator, which is non-sensical in line oriented // matching. let slice = lines::without_terminator( &buf[line], self.config.line_term, ); match self.matcher.is_match(slice) { Err(err) => return Err(S::Error::error_message(err)), Ok(true) => return Ok(Some(line)), Ok(false) => { pos = line.end(); continue; } } } } } Ok(None) } #[inline(always)] fn sink_matched( &mut self, buf: &[u8], range: &Range, ) -> Result { if self.binary && self.detect_binary(buf, range)? { return Ok(false); } if !self.sink_break_context(range.start())? { return Ok(false); } self.count_lines(buf, range.start()); let offset = self.absolute_byte_offset + range.start() as u64; let linebuf = &buf[*range]; let keepgoing = self.sink.matched( &self.searcher, &SinkMatch { line_term: self.config.line_term, bytes: linebuf, absolute_byte_offset: offset, line_number: self.line_number, buffer: buf, bytes_range_in_buffer: range.start()..range.end(), }, )?; if !keepgoing { return Ok(false); } self.last_line_visited = range.end(); self.after_context_left = self.config.after_context; self.has_sunk = true; Ok(true) } fn sink_before_context( &mut self, buf: &[u8], range: &Range, ) -> Result { if self.binary && self.detect_binary(buf, range)? { return Ok(false); } self.count_lines(buf, range.start()); let offset = self.absolute_byte_offset + range.start() as u64; let keepgoing = self.sink.context( &self.searcher, &SinkContext { line_term: self.config.line_term, bytes: &buf[*range], kind: SinkContextKind::Before, absolute_byte_offset: offset, line_number: self.line_number, }, )?; if !keepgoing { return Ok(false); } self.last_line_visited = range.end(); self.has_sunk = true; Ok(true) } fn sink_after_context( &mut self, buf: &[u8], range: &Range, ) -> Result { assert!(self.after_context_left >= 1); if self.binary && self.detect_binary(buf, range)? { return Ok(false); } self.count_lines(buf, range.start()); let offset = self.absolute_byte_offset + range.start() as u64; let keepgoing = self.sink.context( &self.searcher, &SinkContext { line_term: self.config.line_term, bytes: &buf[*range], kind: SinkContextKind::After, absolute_byte_offset: offset, line_number: self.line_number, }, )?; if !keepgoing { return Ok(false); } self.last_line_visited = range.end(); self.after_context_left -= 1; self.has_sunk = true; Ok(true) } fn sink_other_context( &mut self, buf: &[u8], range: &Range, ) -> Result { if self.binary && self.detect_binary(buf, range)? { return Ok(false); } self.count_lines(buf, range.start()); let offset = self.absolute_byte_offset + range.start() as u64; let keepgoing = self.sink.context( &self.searcher, &SinkContext { line_term: self.config.line_term, bytes: &buf[*range], kind: SinkContextKind::Other, absolute_byte_offset: offset, line_number: self.line_number, }, )?; if !keepgoing { return Ok(false); } self.last_line_visited = range.end(); self.has_sunk = true; Ok(true) } fn sink_break_context( &mut self, start_of_line: usize, ) -> Result { let is_gap = self.last_line_visited < start_of_line; let any_context = self.config.before_context > 0 || self.config.after_context > 0; if !any_context || !self.has_sunk || !is_gap { Ok(true) } else { self.sink.context_break(&self.searcher) } } fn count_lines(&mut self, buf: &[u8], upto: usize) { if let Some(ref mut line_number) = self.line_number { if self.last_line_counted >= upto { return; } let slice = &buf[self.last_line_counted..upto]; let count = lines::count(slice, self.config.line_term.as_byte()); *line_number += count; self.last_line_counted = upto; } } fn is_line_by_line_fast(&self) -> bool { debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher)); if self.config.passthru { return false; } if let Some(line_term) = self.matcher.line_terminator() { if line_term == self.config.line_term { return true; } } if let Some(non_matching) = self.matcher.non_matching_bytes() { // If the line terminator is CRLF, we don't actually need to care // whether the regex can match `\r` or not. Namely, a `\r` is // neither necessary nor sufficient to terminate a line. A `\n` is // always required. if non_matching.contains(self.config.line_term.as_byte()) { return true; } } false } } grep-searcher-0.1.8/src/searcher/glue.rs000064400000000000000000001377720072674642500163170ustar 00000000000000use std::cmp; use std::io; use crate::line_buffer::{LineBufferReader, DEFAULT_BUFFER_CAPACITY}; use crate::lines::{self, LineStep}; use crate::sink::{Sink, SinkError}; use grep_matcher::Matcher; use crate::searcher::core::Core; use crate::searcher::{Config, Range, Searcher}; #[derive(Debug)] pub struct ReadByLine<'s, M, R, S> { config: &'s Config, core: Core<'s, M, S>, rdr: LineBufferReader<'s, R>, } impl<'s, M, R, S> ReadByLine<'s, M, R, S> where M: Matcher, R: io::Read, S: Sink, { pub fn new( searcher: &'s Searcher, matcher: M, read_from: LineBufferReader<'s, R>, write_to: S, ) -> ReadByLine<'s, M, R, S> { debug_assert!(!searcher.multi_line_with_matcher(&matcher)); ReadByLine { config: &searcher.config, core: Core::new(searcher, matcher, write_to, false), rdr: read_from, } } pub fn run(mut self) -> Result<(), S::Error> { if self.core.begin()? { while self.fill()? && self.core.match_by_line(self.rdr.buffer())? { } } self.core.finish( self.rdr.absolute_byte_offset(), self.rdr.binary_byte_offset(), ) } fn fill(&mut self) -> Result { assert!(self.rdr.buffer()[self.core.pos()..].is_empty()); let already_binary = self.rdr.binary_byte_offset().is_some(); let old_buf_len = self.rdr.buffer().len(); let consumed = self.core.roll(self.rdr.buffer()); self.rdr.consume(consumed); let didread = match self.rdr.fill() { Err(err) => return Err(S::Error::error_io(err)), Ok(didread) => didread, }; if !already_binary { if let Some(offset) = self.rdr.binary_byte_offset() { if !self.core.binary_data(offset)? { return Ok(false); } } } if !didread || self.should_binary_quit() { return Ok(false); } // If rolling the buffer didn't result in consuming anything and if // re-filling the buffer didn't add any bytes, then the only thing in // our buffer is leftover context, which we no longer need since there // is nothing left to search. So forcefully quit. if consumed == 0 && old_buf_len == self.rdr.buffer().len() { self.rdr.consume(old_buf_len); return Ok(false); } Ok(true) } fn should_binary_quit(&self) -> bool { self.rdr.binary_byte_offset().is_some() && self.config.binary.quit_byte().is_some() } } #[derive(Debug)] pub struct SliceByLine<'s, M, S> { config: &'s Config, core: Core<'s, M, S>, slice: &'s [u8], } impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> { pub fn new( searcher: &'s Searcher, matcher: M, slice: &'s [u8], write_to: S, ) -> SliceByLine<'s, M, S> { debug_assert!(!searcher.multi_line_with_matcher(&matcher)); SliceByLine { config: &searcher.config, core: Core::new(searcher, matcher, write_to, true), slice: slice, } } pub fn run(mut self) -> Result<(), S::Error> { if self.core.begin()? { let binary_upto = cmp::min(self.slice.len(), DEFAULT_BUFFER_CAPACITY); let binary_range = Range::new(0, binary_upto); if !self.core.detect_binary(self.slice, &binary_range)? { while !self.slice[self.core.pos()..].is_empty() && self.core.match_by_line(self.slice)? {} } } let byte_count = self.byte_count(); let binary_byte_offset = self.core.binary_byte_offset(); self.core.finish(byte_count, binary_byte_offset) } fn byte_count(&mut self) -> u64 { match self.core.binary_byte_offset() { Some(offset) if offset < self.core.pos() as u64 => offset, _ => self.core.pos() as u64, } } } #[derive(Debug)] pub struct MultiLine<'s, M, S> { config: &'s Config, core: Core<'s, M, S>, slice: &'s [u8], last_match: Option, } impl<'s, M: Matcher, S: Sink> MultiLine<'s, M, S> { pub fn new( searcher: &'s Searcher, matcher: M, slice: &'s [u8], write_to: S, ) -> MultiLine<'s, M, S> { debug_assert!(searcher.multi_line_with_matcher(&matcher)); MultiLine { config: &searcher.config, core: Core::new(searcher, matcher, write_to, true), slice: slice, last_match: None, } } pub fn run(mut self) -> Result<(), S::Error> { if self.core.begin()? { let binary_upto = cmp::min(self.slice.len(), DEFAULT_BUFFER_CAPACITY); let binary_range = Range::new(0, binary_upto); if !self.core.detect_binary(self.slice, &binary_range)? { let mut keepgoing = true; while !self.slice[self.core.pos()..].is_empty() && keepgoing { keepgoing = self.sink()?; } if keepgoing { keepgoing = match self.last_match.take() { None => true, Some(last_match) => { if self.sink_context(&last_match)? { self.sink_matched(&last_match)?; } true } }; } // Take care of any remaining context after the last match. if keepgoing { if self.config.passthru { self.core.other_context_by_line( self.slice, self.slice.len(), )?; } else { self.core.after_context_by_line( self.slice, self.slice.len(), )?; } } } } let byte_count = self.byte_count(); let binary_byte_offset = self.core.binary_byte_offset(); self.core.finish(byte_count, binary_byte_offset) } fn sink(&mut self) -> Result { if self.config.invert_match { return self.sink_matched_inverted(); } let mat = match self.find()? { Some(range) => range, None => { self.core.set_pos(self.slice.len()); return Ok(true); } }; self.advance(&mat); let line = lines::locate(self.slice, self.config.line_term.as_byte(), mat); // We delay sinking the match to make sure we group adjacent matches // together in a single sink. Adjacent matches are distinct matches // that start and end on the same line, respectively. This guarantees // that a single line is never sinked more than once. match self.last_match.take() { None => { self.last_match = Some(line); Ok(true) } Some(last_match) => { // If the lines in the previous match overlap with the lines // in this match, then simply grow the match and move on. This // happens when the next match begins on the same line that the // last match ends on. // // Note that we do not technically require strict overlap here. // Instead, we only require that the lines are adjacent. This // provides larger blocks of lines to the printer, and results // in overall better behavior with respect to how replacements // are handled. // // See: https://github.com/BurntSushi/ripgrep/issues/1311 // And also the associated commit fixing #1311. if last_match.end() >= line.start() { self.last_match = Some(last_match.with_end(line.end())); Ok(true) } else { self.last_match = Some(line); if !self.sink_context(&last_match)? { return Ok(false); } self.sink_matched(&last_match) } } } } fn sink_matched_inverted(&mut self) -> Result { assert!(self.config.invert_match); let invert_match = match self.find()? { None => { let range = Range::new(self.core.pos(), self.slice.len()); self.core.set_pos(range.end()); range } Some(mat) => { let line = lines::locate( self.slice, self.config.line_term.as_byte(), mat, ); let range = Range::new(self.core.pos(), line.start()); self.advance(&line); range } }; if invert_match.is_empty() { return Ok(true); } if !self.sink_context(&invert_match)? { return Ok(false); } let mut stepper = LineStep::new( self.config.line_term.as_byte(), invert_match.start(), invert_match.end(), ); while let Some(line) = stepper.next_match(self.slice) { if !self.sink_matched(&line)? { return Ok(false); } } Ok(true) } fn sink_matched(&mut self, range: &Range) -> Result { if range.is_empty() { // The only way we can produce an empty line for a match is if we // match the position immediately following the last byte that we // search, and where that last byte is also the line terminator. We // never want to report that match, and we know we're done at that // point anyway, so stop the search. return Ok(false); } self.core.matched(self.slice, range) } fn sink_context(&mut self, range: &Range) -> Result { if self.config.passthru { if !self.core.other_context_by_line(self.slice, range.start())? { return Ok(false); } } else { if !self.core.after_context_by_line(self.slice, range.start())? { return Ok(false); } if !self.core.before_context_by_line(self.slice, range.start())? { return Ok(false); } } Ok(true) } fn find(&mut self) -> Result, S::Error> { match self.core.matcher().find(&self.slice[self.core.pos()..]) { Err(err) => Err(S::Error::error_message(err)), Ok(None) => Ok(None), Ok(Some(m)) => Ok(Some(m.offset(self.core.pos()))), } } /// Advance the search position based on the previous match. /// /// If the previous match is zero width, then this advances the search /// position one byte past the end of the match. fn advance(&mut self, range: &Range) { self.core.set_pos(range.end()); if range.is_empty() && self.core.pos() < self.slice.len() { let newpos = self.core.pos() + 1; self.core.set_pos(newpos); } } fn byte_count(&mut self) -> u64 { match self.core.binary_byte_offset() { Some(offset) if offset < self.core.pos() as u64 => offset, _ => self.core.pos() as u64, } } } #[cfg(test)] mod tests { use crate::searcher::{BinaryDetection, SearcherBuilder}; use crate::testutil::{KitchenSink, RegexMatcher, SearcherTester}; use super::*; const SHERLOCK: &'static str = "\ For the Doctor Watsons of this world, as opposed to the Sherlock Holmeses, success in the province of detective work must always be, to a very large extent, the result of luck. Sherlock Holmes can extract a clew from a wisp of straw or a flake of cigar ash; but Doctor Watson has to have it taken out for him and dusted, and exhibited clearly, with a label attached.\ "; const CODE: &'static str = "\ extern crate snap; use std::io; fn main() { let stdin = io::stdin(); let stdout = io::stdout(); // Wrap the stdin reader in a Snappy reader. let mut rdr = snap::Reader::new(stdin.lock()); let mut wtr = stdout.lock(); io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); } "; #[test] fn basic1() { let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 129:be, to a very large extent, the result of luck. Sherlock Holmes byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn basic2() { let exp = "\nbyte count:366\n"; SearcherTester::new(SHERLOCK, "NADA") .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn basic3() { let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65:Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes 193:can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "a") .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn basic4() { let haystack = "\ a b c d "; let byte_count = haystack.len(); let exp = format!("0:a\n\nbyte count:{}\n", byte_count); SearcherTester::new(haystack, "a") .line_number(false) .expected_no_line_number(&exp) .test(); } #[test] fn invert1() { let exp = "\ 65:Holmeses, success in the province of detective work must always 193:can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .line_number(false) .invert_match(true) .expected_no_line_number(exp) .test(); } #[test] fn line_number1() { let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 129:be, to a very large extent, the result of luck. Sherlock Holmes byte count:366 "; let exp_line = "\ 1:0:For the Doctor Watsons of this world, as opposed to the Sherlock 3:129:be, to a very large extent, the result of luck. Sherlock Holmes byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .expected_no_line_number(exp) .expected_with_line_number(exp_line) .test(); } #[test] fn line_number_invert1() { let exp = "\ 65:Holmeses, success in the province of detective work must always 193:can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; let exp_line = "\ 2:65:Holmeses, success in the province of detective work must always 4:193:can extract a clew from a wisp of straw or a flake of cigar ash; 5:258:but Doctor Watson has to have it taken out for him and dusted, 6:321:and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .invert_match(true) .expected_no_line_number(exp) .expected_with_line_number(exp_line) .test(); } #[test] fn multi_line_overlap1() { let haystack = "xxx\nabc\ndefxxxabc\ndefxxx\nxxx"; let byte_count = haystack.len(); let exp = format!( "4:abc\n8:defxxxabc\n18:defxxx\n\nbyte count:{}\n", byte_count ); SearcherTester::new(haystack, "abc\ndef") .by_line(false) .line_number(false) .expected_no_line_number(&exp) .test(); } #[test] fn multi_line_overlap2() { let haystack = "xxx\nabc\ndefabc\ndefxxx\nxxx"; let byte_count = haystack.len(); let exp = format!( "4:abc\n8:defabc\n15:defxxx\n\nbyte count:{}\n", byte_count ); SearcherTester::new(haystack, "abc\ndef") .by_line(false) .line_number(false) .expected_no_line_number(&exp) .test(); } #[test] fn empty_line1() { let exp = "\nbyte count:0\n"; SearcherTester::new("", r"^$") .expected_no_line_number(exp) .expected_with_line_number(exp) .test(); } #[test] fn empty_line2() { let exp = "0:\n\nbyte count:1\n"; let exp_line = "1:0:\n\nbyte count:1\n"; SearcherTester::new("\n", r"^$") .expected_no_line_number(exp) .expected_with_line_number(exp_line) .test(); } #[test] fn empty_line3() { let exp = "0:\n1:\n\nbyte count:2\n"; let exp_line = "1:0:\n2:1:\n\nbyte count:2\n"; SearcherTester::new("\n\n", r"^$") .expected_no_line_number(exp) .expected_with_line_number(exp_line) .test(); } #[test] fn empty_line4() { // See: https://github.com/BurntSushi/ripgrep/issues/441 let haystack = "\ a b c d "; let byte_count = haystack.len(); let exp = format!("4:\n7:\n8:\n\nbyte count:{}\n", byte_count); let exp_line = format!("3:4:\n5:7:\n6:8:\n\nbyte count:{}\n", byte_count); SearcherTester::new(haystack, r"^$") .expected_no_line_number(&exp) .expected_with_line_number(&exp_line) .test(); } #[test] fn empty_line5() { // See: https://github.com/BurntSushi/ripgrep/issues/441 // This is like empty_line4, but lacks the trailing line terminator. let haystack = "\ a b c d"; let byte_count = haystack.len(); let exp = format!("4:\n7:\n8:\n\nbyte count:{}\n", byte_count); let exp_line = format!("3:4:\n5:7:\n6:8:\n\nbyte count:{}\n", byte_count); SearcherTester::new(haystack, r"^$") .expected_no_line_number(&exp) .expected_with_line_number(&exp_line) .test(); } #[test] fn empty_line6() { // See: https://github.com/BurntSushi/ripgrep/issues/441 // This is like empty_line4, but includes an empty line at the end. let haystack = "\ a b c d "; let byte_count = haystack.len(); let exp = format!("4:\n7:\n8:\n11:\n\nbyte count:{}\n", byte_count); let exp_line = format!("3:4:\n5:7:\n6:8:\n8:11:\n\nbyte count:{}\n", byte_count); SearcherTester::new(haystack, r"^$") .expected_no_line_number(&exp) .expected_with_line_number(&exp_line) .test(); } #[test] fn big1() { let mut haystack = String::new(); haystack.push_str("a\n"); // Pick an arbitrary number above the capacity. for _ in 0..(4 * (DEFAULT_BUFFER_CAPACITY + 7)) { haystack.push_str("zzz\n"); } haystack.push_str("a\n"); let byte_count = haystack.len(); let exp = format!("0:a\n1048690:a\n\nbyte count:{}\n", byte_count); SearcherTester::new(&haystack, "a") .line_number(false) .expected_no_line_number(&exp) .test(); } #[test] fn big_error_one_line() { let mut haystack = String::new(); haystack.push_str("a\n"); // Pick an arbitrary number above the capacity. for _ in 0..(4 * (DEFAULT_BUFFER_CAPACITY + 7)) { haystack.push_str("zzz\n"); } haystack.push_str("a\n"); let matcher = RegexMatcher::new("a"); let mut sink = KitchenSink::new(); let mut searcher = SearcherBuilder::new() .heap_limit(Some(3)) // max line length is 4, one byte short .build(); let result = searcher.search_reader(&matcher, haystack.as_bytes(), &mut sink); assert!(result.is_err()); } #[test] fn big_error_multi_line() { let mut haystack = String::new(); haystack.push_str("a\n"); // Pick an arbitrary number above the capacity. for _ in 0..(4 * (DEFAULT_BUFFER_CAPACITY + 7)) { haystack.push_str("zzz\n"); } haystack.push_str("a\n"); let matcher = RegexMatcher::new("a"); let mut sink = KitchenSink::new(); let mut searcher = SearcherBuilder::new() .multi_line(true) .heap_limit(Some(haystack.len())) // actually need one more byte .build(); let result = searcher.search_reader(&matcher, haystack.as_bytes(), &mut sink); assert!(result.is_err()); } #[test] fn binary1() { let haystack = "\x00a"; let exp = "\nbyte count:0\nbinary offset:0\n"; SearcherTester::new(haystack, "a") .binary_detection(BinaryDetection::quit(0)) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn binary2() { let haystack = "a\x00"; let exp = "\nbyte count:0\nbinary offset:1\n"; SearcherTester::new(haystack, "a") .binary_detection(BinaryDetection::quit(0)) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn binary3() { let mut haystack = String::new(); haystack.push_str("a\n"); for _ in 0..DEFAULT_BUFFER_CAPACITY { haystack.push_str("zzz\n"); } haystack.push_str("a\n"); haystack.push_str("zzz\n"); haystack.push_str("a\x00a\n"); haystack.push_str("zzz\n"); haystack.push_str("a\n"); // The line buffered searcher has slightly different semantics here. // Namely, it will *always* detect binary data in the current buffer // before searching it. Thus, the total number of bytes searched is // smaller than below. let exp = "0:a\n\nbyte count:262146\nbinary offset:262153\n"; // In contrast, the slice readers (for multi line as well) will only // look for binary data in the initial chunk of bytes. After that // point, it only looks for binary data in matches. Note though that // the binary offset remains the same. (See the binary4 test for a case // where the offset is explicitly different.) let exp_slice = "0:a\n262146:a\n\nbyte count:262153\nbinary offset:262153\n"; SearcherTester::new(&haystack, "a") .binary_detection(BinaryDetection::quit(0)) .line_number(false) .auto_heap_limit(false) .expected_no_line_number(exp) .expected_slice_no_line_number(exp_slice) .test(); } #[test] fn binary4() { let mut haystack = String::new(); haystack.push_str("a\n"); for _ in 0..DEFAULT_BUFFER_CAPACITY { haystack.push_str("zzz\n"); } haystack.push_str("a\n"); // The Read searcher will detect binary data here, but since this is // beyond the initial buffer size and doesn't otherwise contain a // match, the Slice reader won't detect the binary data until the next // line (which is a match). haystack.push_str("b\x00b\n"); haystack.push_str("a\x00a\n"); haystack.push_str("a\n"); let exp = "0:a\n\nbyte count:262146\nbinary offset:262149\n"; // The binary offset for the Slice readers corresponds to the binary // data in `a\x00a\n` since the first line with binary data // (`b\x00b\n`) isn't part of a match, and is therefore undetected. let exp_slice = "0:a\n262146:a\n\nbyte count:262153\nbinary offset:262153\n"; SearcherTester::new(&haystack, "a") .binary_detection(BinaryDetection::quit(0)) .line_number(false) .auto_heap_limit(false) .expected_no_line_number(exp) .expected_slice_no_line_number(exp_slice) .test(); } #[test] fn passthru_sherlock1() { let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65-Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258-but Doctor Watson has to have it taken out for him and dusted, 321-and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .passthru(true) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn passthru_sherlock_invert1() { let exp = "\ 0-For the Doctor Watsons of this world, as opposed to the Sherlock 65:Holmeses, success in the province of detective work must always 129-be, to a very large extent, the result of luck. Sherlock Holmes 193:can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .passthru(true) .line_number(false) .invert_match(true) .expected_no_line_number(exp) .test(); } #[test] fn context_sherlock1() { let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65-Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; byte count:366 "; let exp_lines = "\ 1:0:For the Doctor Watsons of this world, as opposed to the Sherlock 2-65-Holmeses, success in the province of detective work must always 3:129:be, to a very large extent, the result of luck. Sherlock Holmes 4-193-can extract a clew from a wisp of straw or a flake of cigar ash; byte count:366 "; // before and after + line numbers SearcherTester::new(SHERLOCK, "Sherlock") .after_context(1) .before_context(1) .line_number(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // after SearcherTester::new(SHERLOCK, "Sherlock") .after_context(1) .line_number(false) .expected_no_line_number(exp) .test(); // before let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65-Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .before_context(1) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn context_sherlock_invert1() { let exp = "\ 0-For the Doctor Watsons of this world, as opposed to the Sherlock 65:Holmeses, success in the province of detective work must always 129-be, to a very large extent, the result of luck. Sherlock Holmes 193:can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; let exp_lines = "\ 1-0-For the Doctor Watsons of this world, as opposed to the Sherlock 2:65:Holmeses, success in the province of detective work must always 3-129-be, to a very large extent, the result of luck. Sherlock Holmes 4:193:can extract a clew from a wisp of straw or a flake of cigar ash; 5:258:but Doctor Watson has to have it taken out for him and dusted, 6:321:and exhibited clearly, with a label attached. byte count:366 "; // before and after + line numbers SearcherTester::new(SHERLOCK, "Sherlock") .after_context(1) .before_context(1) .line_number(true) .invert_match(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // before SearcherTester::new(SHERLOCK, "Sherlock") .before_context(1) .line_number(false) .invert_match(true) .expected_no_line_number(exp) .test(); // after let exp = "\ 65:Holmeses, success in the province of detective work must always 129-be, to a very large extent, the result of luck. Sherlock Holmes 193:can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .after_context(1) .line_number(false) .invert_match(true) .expected_no_line_number(exp) .test(); } #[test] fn context_sherlock2() { let exp = "\ 65-Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes 193:can extract a clew from a wisp of straw or a flake of cigar ash; 258-but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; let exp_lines = "\ 2-65-Holmeses, success in the province of detective work must always 3:129:be, to a very large extent, the result of luck. Sherlock Holmes 4:193:can extract a clew from a wisp of straw or a flake of cigar ash; 5-258-but Doctor Watson has to have it taken out for him and dusted, 6:321:and exhibited clearly, with a label attached. byte count:366 "; // before + after + line numbers SearcherTester::new(SHERLOCK, " a ") .after_context(1) .before_context(1) .line_number(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // before SearcherTester::new(SHERLOCK, " a ") .before_context(1) .line_number(false) .expected_no_line_number(exp) .test(); // after let exp = "\ 129:be, to a very large extent, the result of luck. Sherlock Holmes 193:can extract a clew from a wisp of straw or a flake of cigar ash; 258-but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, " a ") .after_context(1) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn context_sherlock_invert2() { let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65:Holmeses, success in the province of detective work must always 129-be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, 321-and exhibited clearly, with a label attached. byte count:366 "; let exp_lines = "\ 1:0:For the Doctor Watsons of this world, as opposed to the Sherlock 2:65:Holmeses, success in the province of detective work must always 3-129-be, to a very large extent, the result of luck. Sherlock Holmes 4-193-can extract a clew from a wisp of straw or a flake of cigar ash; 5:258:but Doctor Watson has to have it taken out for him and dusted, 6-321-and exhibited clearly, with a label attached. byte count:366 "; // before + after + line numbers SearcherTester::new(SHERLOCK, " a ") .after_context(1) .before_context(1) .line_number(true) .invert_match(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // before let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65:Holmeses, success in the province of detective work must always -- 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, byte count:366 "; SearcherTester::new(SHERLOCK, " a ") .before_context(1) .line_number(false) .invert_match(true) .expected_no_line_number(exp) .test(); // after let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65:Holmeses, success in the province of detective work must always 129-be, to a very large extent, the result of luck. Sherlock Holmes -- 258:but Doctor Watson has to have it taken out for him and dusted, 321-and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, " a ") .after_context(1) .line_number(false) .invert_match(true) .expected_no_line_number(exp) .test(); } #[test] fn context_sherlock3() { let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65-Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258-but Doctor Watson has to have it taken out for him and dusted, byte count:366 "; let exp_lines = "\ 1:0:For the Doctor Watsons of this world, as opposed to the Sherlock 2-65-Holmeses, success in the province of detective work must always 3:129:be, to a very large extent, the result of luck. Sherlock Holmes 4-193-can extract a clew from a wisp of straw or a flake of cigar ash; 5-258-but Doctor Watson has to have it taken out for him and dusted, byte count:366 "; // before and after + line numbers SearcherTester::new(SHERLOCK, "Sherlock") .after_context(2) .before_context(2) .line_number(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // after SearcherTester::new(SHERLOCK, "Sherlock") .after_context(2) .line_number(false) .expected_no_line_number(exp) .test(); // before let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65-Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .before_context(2) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn context_sherlock4() { let exp = "\ 129-be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, 321-and exhibited clearly, with a label attached. byte count:366 "; let exp_lines = "\ 3-129-be, to a very large extent, the result of luck. Sherlock Holmes 4-193-can extract a clew from a wisp of straw or a flake of cigar ash; 5:258:but Doctor Watson has to have it taken out for him and dusted, 6-321-and exhibited clearly, with a label attached. byte count:366 "; // before and after + line numbers SearcherTester::new(SHERLOCK, "dusted") .after_context(2) .before_context(2) .line_number(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // after let exp = "\ 258:but Doctor Watson has to have it taken out for him and dusted, 321-and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "dusted") .after_context(2) .line_number(false) .expected_no_line_number(exp) .test(); // before let exp = "\ 129-be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258:but Doctor Watson has to have it taken out for him and dusted, byte count:366 "; SearcherTester::new(SHERLOCK, "dusted") .before_context(2) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn context_sherlock5() { let exp = "\ 0-For the Doctor Watsons of this world, as opposed to the Sherlock 65:Holmeses, success in the province of detective work must always 129-be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258-but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; let exp_lines = "\ 1-0-For the Doctor Watsons of this world, as opposed to the Sherlock 2:65:Holmeses, success in the province of detective work must always 3-129-be, to a very large extent, the result of luck. Sherlock Holmes 4-193-can extract a clew from a wisp of straw or a flake of cigar ash; 5-258-but Doctor Watson has to have it taken out for him and dusted, 6:321:and exhibited clearly, with a label attached. byte count:366 "; // before and after + line numbers SearcherTester::new(SHERLOCK, "success|attached") .after_context(2) .before_context(2) .line_number(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // after let exp = "\ 65:Holmeses, success in the province of detective work must always 129-be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; -- 321:and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "success|attached") .after_context(2) .line_number(false) .expected_no_line_number(exp) .test(); // before let exp = "\ 0-For the Doctor Watsons of this world, as opposed to the Sherlock 65:Holmeses, success in the province of detective work must always -- 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258-but Doctor Watson has to have it taken out for him and dusted, 321:and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "success|attached") .before_context(2) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn context_sherlock6() { let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65-Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258-but Doctor Watson has to have it taken out for him and dusted, 321-and exhibited clearly, with a label attached. byte count:366 "; let exp_lines = "\ 1:0:For the Doctor Watsons of this world, as opposed to the Sherlock 2-65-Holmeses, success in the province of detective work must always 3:129:be, to a very large extent, the result of luck. Sherlock Holmes 4-193-can extract a clew from a wisp of straw or a flake of cigar ash; 5-258-but Doctor Watson has to have it taken out for him and dusted, 6-321-and exhibited clearly, with a label attached. byte count:366 "; // before and after + line numbers SearcherTester::new(SHERLOCK, "Sherlock") .after_context(3) .before_context(3) .line_number(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // after let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65-Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes 193-can extract a clew from a wisp of straw or a flake of cigar ash; 258-but Doctor Watson has to have it taken out for him and dusted, 321-and exhibited clearly, with a label attached. byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .after_context(3) .line_number(false) .expected_no_line_number(exp) .test(); // before let exp = "\ 0:For the Doctor Watsons of this world, as opposed to the Sherlock 65-Holmeses, success in the province of detective work must always 129:be, to a very large extent, the result of luck. Sherlock Holmes byte count:366 "; SearcherTester::new(SHERLOCK, "Sherlock") .before_context(3) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn context_code1() { // before and after let exp = "\ 33- 34-fn main() { 46: let stdin = io::stdin(); 75- let stdout = io::stdout(); 106- 107: // Wrap the stdin reader in a Snappy reader. 156: let mut rdr = snap::Reader::new(stdin.lock()); 207- let mut wtr = stdout.lock(); 240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); byte count:307 "; let exp_lines = "\ 4-33- 5-34-fn main() { 6:46: let stdin = io::stdin(); 7-75- let stdout = io::stdout(); 8-106- 9:107: // Wrap the stdin reader in a Snappy reader. 10:156: let mut rdr = snap::Reader::new(stdin.lock()); 11-207- let mut wtr = stdout.lock(); 12-240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); byte count:307 "; // before and after + line numbers SearcherTester::new(CODE, "stdin") .after_context(2) .before_context(2) .line_number(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // after let exp = "\ 46: let stdin = io::stdin(); 75- let stdout = io::stdout(); 106- 107: // Wrap the stdin reader in a Snappy reader. 156: let mut rdr = snap::Reader::new(stdin.lock()); 207- let mut wtr = stdout.lock(); 240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); byte count:307 "; SearcherTester::new(CODE, "stdin") .after_context(2) .line_number(false) .expected_no_line_number(exp) .test(); // before let exp = "\ 33- 34-fn main() { 46: let stdin = io::stdin(); 75- let stdout = io::stdout(); 106- 107: // Wrap the stdin reader in a Snappy reader. 156: let mut rdr = snap::Reader::new(stdin.lock()); byte count:307 "; SearcherTester::new(CODE, "stdin") .before_context(2) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn context_code2() { let exp = "\ 34-fn main() { 46- let stdin = io::stdin(); 75: let stdout = io::stdout(); 106- 107- // Wrap the stdin reader in a Snappy reader. 156- let mut rdr = snap::Reader::new(stdin.lock()); 207: let mut wtr = stdout.lock(); 240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); 305-} byte count:307 "; let exp_lines = "\ 5-34-fn main() { 6-46- let stdin = io::stdin(); 7:75: let stdout = io::stdout(); 8-106- 9-107- // Wrap the stdin reader in a Snappy reader. 10-156- let mut rdr = snap::Reader::new(stdin.lock()); 11:207: let mut wtr = stdout.lock(); 12-240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); 13-305-} byte count:307 "; // before and after + line numbers SearcherTester::new(CODE, "stdout") .after_context(2) .before_context(2) .line_number(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // after let exp = "\ 75: let stdout = io::stdout(); 106- 107- // Wrap the stdin reader in a Snappy reader. -- 207: let mut wtr = stdout.lock(); 240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); 305-} byte count:307 "; SearcherTester::new(CODE, "stdout") .after_context(2) .line_number(false) .expected_no_line_number(exp) .test(); // before let exp = "\ 34-fn main() { 46- let stdin = io::stdin(); 75: let stdout = io::stdout(); -- 107- // Wrap the stdin reader in a Snappy reader. 156- let mut rdr = snap::Reader::new(stdin.lock()); 207: let mut wtr = stdout.lock(); byte count:307 "; SearcherTester::new(CODE, "stdout") .before_context(2) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn context_code3() { let exp = "\ 20-use std::io; 33- 34:fn main() { 46- let stdin = io::stdin(); 75- let stdout = io::stdout(); 106- 107- // Wrap the stdin reader in a Snappy reader. 156: let mut rdr = snap::Reader::new(stdin.lock()); 207- let mut wtr = stdout.lock(); 240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); byte count:307 "; let exp_lines = "\ 3-20-use std::io; 4-33- 5:34:fn main() { 6-46- let stdin = io::stdin(); 7-75- let stdout = io::stdout(); 8-106- 9-107- // Wrap the stdin reader in a Snappy reader. 10:156: let mut rdr = snap::Reader::new(stdin.lock()); 11-207- let mut wtr = stdout.lock(); 12-240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); byte count:307 "; // before and after + line numbers SearcherTester::new(CODE, "fn main|let mut rdr") .after_context(2) .before_context(2) .line_number(true) .expected_no_line_number(exp) .expected_with_line_number(exp_lines) .test(); // after let exp = "\ 34:fn main() { 46- let stdin = io::stdin(); 75- let stdout = io::stdout(); -- 156: let mut rdr = snap::Reader::new(stdin.lock()); 207- let mut wtr = stdout.lock(); 240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); byte count:307 "; SearcherTester::new(CODE, "fn main|let mut rdr") .after_context(2) .line_number(false) .expected_no_line_number(exp) .test(); // before let exp = "\ 20-use std::io; 33- 34:fn main() { -- 106- 107- // Wrap the stdin reader in a Snappy reader. 156: let mut rdr = snap::Reader::new(stdin.lock()); byte count:307 "; SearcherTester::new(CODE, "fn main|let mut rdr") .before_context(2) .line_number(false) .expected_no_line_number(exp) .test(); } #[test] fn scratch() { use crate::sinks; use crate::testutil::RegexMatcher; const SHERLOCK: &'static [u8] = b"\ For the Doctor Wat\xFFsons of this world, as opposed to the Sherlock Holmeses, success in the province of detective work must always be, to a very large extent, the result of luck. Sherlock Holmes can extract a clew from a wisp of straw or a flake of cigar ash; but Doctor Watson has to have it taken out for him and dusted, and exhibited clearly, with a label attached.\ "; let haystack = SHERLOCK; let matcher = RegexMatcher::new("Sherlock"); let mut searcher = SearcherBuilder::new().line_number(true).build(); searcher .search_reader( &matcher, haystack, sinks::Lossy(|n, line| { print!("{}:{}", n, line); Ok(true) }), ) .unwrap(); } } grep-searcher-0.1.8/src/searcher/mmap.rs000064400000000000000000000071650072674642500163050ustar 00000000000000use std::fs::File; use std::path::Path; use memmap::Mmap; /// Controls the strategy used for determining when to use memory maps. /// /// If a searcher is called in circumstances where it is possible to use memory /// maps, and memory maps are enabled, then it will attempt to do so if it /// believes it will make the search faster. /// /// By default, memory maps are disabled. #[derive(Clone, Debug)] pub struct MmapChoice(MmapChoiceImpl); #[derive(Clone, Debug)] enum MmapChoiceImpl { Auto, Never, } impl Default for MmapChoice { fn default() -> MmapChoice { MmapChoice(MmapChoiceImpl::Never) } } impl MmapChoice { /// Use memory maps when they are believed to be advantageous. /// /// The heuristics used to determine whether to use a memory map or not /// may depend on many things, including but not limited to, file size /// and platform. /// /// If memory maps are unavailable or cannot be used for a specific input, /// then normal OS read calls are used instead. /// /// # Safety /// /// This constructor is not safe because there is no obvious way to /// encapsulate the safety of file backed memory maps on all platforms /// without simultaneously negating some or all of their benefits. /// /// The specific contract the caller is required to uphold isn't precise, /// but it basically amounts to something like, "the caller guarantees that /// the underlying file won't be mutated." This, of course, isn't feasible /// in many environments. However, command line tools may still decide to /// take the risk of, say, a `SIGBUS` occurring while attempting to read a /// memory map. pub unsafe fn auto() -> MmapChoice { MmapChoice(MmapChoiceImpl::Auto) } /// Never use memory maps, no matter what. This is the default. pub fn never() -> MmapChoice { MmapChoice(MmapChoiceImpl::Never) } /// Return a memory map if memory maps are enabled and if creating a /// memory from the given file succeeded and if memory maps are believed /// to be advantageous for performance. /// /// If this does attempt to open a memory map and it fails, then `None` /// is returned and the corresponding error (along with the file path, if /// present) is logged at the debug level. pub(crate) fn open( &self, file: &File, path: Option<&Path>, ) -> Option { if !self.is_enabled() { return None; } if cfg!(target_os = "macos") { // I guess memory maps on macOS aren't great. Should re-evaluate. return None; } // SAFETY: This is acceptable because the only way `MmapChoiceImpl` can // be `Auto` is if the caller invoked the `auto` constructor, which // is itself not safe. Thus, this is a propagation of the caller's // assertion that using memory maps is safe. match unsafe { Mmap::map(file) } { Ok(mmap) => Some(mmap), Err(err) => { if let Some(path) = path { log::debug!( "{}: failed to open memory map: {}", path.display(), err ); } else { log::debug!("failed to open memory map: {}", err); } None } } } /// Whether this strategy may employ memory maps or not. pub(crate) fn is_enabled(&self) -> bool { match self.0 { MmapChoiceImpl::Auto => true, MmapChoiceImpl::Never => false, } } } grep-searcher-0.1.8/src/searcher/mod.rs000064400000000000000000001151040072674642500161230ustar 00000000000000use std::cell::RefCell; use std::cmp; use std::fmt; use std::fs::File; use std::io::{self, Read}; use std::path::Path; use crate::line_buffer::{ self, alloc_error, BufferAllocation, LineBuffer, LineBufferBuilder, LineBufferReader, DEFAULT_BUFFER_CAPACITY, }; use crate::searcher::glue::{MultiLine, ReadByLine, SliceByLine}; use crate::sink::{Sink, SinkError}; use encoding_rs; use encoding_rs_io::DecodeReaderBytesBuilder; use grep_matcher::{LineTerminator, Match, Matcher}; pub use self::mmap::MmapChoice; mod core; mod glue; mod mmap; /// We use this type alias since we want the ergonomics of a matcher's `Match` /// type, but in practice, we use it for arbitrary ranges, so give it a more /// accurate name. This is only used in the searcher's internals. type Range = Match; /// The behavior of binary detection while searching. /// /// Binary detection is the process of _heuristically_ identifying whether a /// given chunk of data is binary or not, and then taking an action based on /// the result of that heuristic. The motivation behind detecting binary data /// is that binary data often indicates data that is undesirable to search /// using textual patterns. Of course, there are many cases in which this isn't /// true, which is why binary detection is disabled by default. /// /// Unfortunately, binary detection works differently depending on the type of /// search being executed: /// /// 1. When performing a search using a fixed size buffer, binary detection is /// applied to the buffer's contents as it is filled. Binary detection must /// be applied to the buffer directly because binary files may not contain /// line terminators, which could result in exorbitant memory usage. /// 2. When performing a search using memory maps or by reading data off the /// heap, then binary detection is only guaranteed to be applied to the /// parts corresponding to a match. When `Quit` is enabled, then the first /// few KB of the data are searched for binary data. #[derive(Clone, Debug, Default)] pub struct BinaryDetection(line_buffer::BinaryDetection); impl BinaryDetection { /// No binary detection is performed. Data reported by the searcher may /// contain arbitrary bytes. /// /// This is the default. pub fn none() -> BinaryDetection { BinaryDetection(line_buffer::BinaryDetection::None) } /// Binary detection is performed by looking for the given byte. /// /// When searching is performed using a fixed size buffer, then the /// contents of that buffer are always searched for the presence of this /// byte. If it is found, then the underlying data is considered binary /// and the search stops as if it reached EOF. /// /// When searching is performed with the entire contents mapped into /// memory, then binary detection is more conservative. Namely, only a /// fixed sized region at the beginning of the contents are detected for /// binary data. As a compromise, any subsequent matching (or context) /// lines are also searched for binary data. If binary data is detected at /// any point, then the search stops as if it reached EOF. pub fn quit(binary_byte: u8) -> BinaryDetection { BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte)) } /// Binary detection is performed by looking for the given byte, and /// replacing it with the line terminator configured on the searcher. /// (If the searcher is configured to use `CRLF` as the line terminator, /// then this byte is replaced by just `LF`.) /// /// When searching is performed using a fixed size buffer, then the /// contents of that buffer are always searched for the presence of this /// byte and replaced with the line terminator. In effect, the caller is /// guaranteed to never observe this byte while searching. /// /// When searching is performed with the entire contents mapped into /// memory, then this setting has no effect and is ignored. pub fn convert(binary_byte: u8) -> BinaryDetection { BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte)) } /// If this binary detection uses the "quit" strategy, then this returns /// the byte that will cause a search to quit. In any other case, this /// returns `None`. pub fn quit_byte(&self) -> Option { match self.0 { line_buffer::BinaryDetection::Quit(b) => Some(b), _ => None, } } /// If this binary detection uses the "convert" strategy, then this returns /// the byte that will be replaced by the line terminator. In any other /// case, this returns `None`. pub fn convert_byte(&self) -> Option { match self.0 { line_buffer::BinaryDetection::Convert(b) => Some(b), _ => None, } } } /// An encoding to use when searching. /// /// An encoding can be used to configure a /// [`SearcherBuilder`](struct.SearchBuilder.html) /// to transcode source data from an encoding to UTF-8 before searching. /// /// An `Encoding` will always be cheap to clone. #[derive(Clone, Debug)] pub struct Encoding(&'static encoding_rs::Encoding); impl Encoding { /// Create a new encoding for the specified label. /// /// The encoding label provided is mapped to an encoding via the set of /// available choices specified in the /// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get). /// If the given label does not correspond to a valid encoding, then this /// returns an error. pub fn new(label: &str) -> Result { let label = label.as_bytes(); match encoding_rs::Encoding::for_label_no_replacement(label) { Some(encoding) => Ok(Encoding(encoding)), None => { Err(ConfigError::UnknownEncoding { label: label.to_vec() }) } } } } /// The internal configuration of a searcher. This is shared among several /// search related types, but is only ever written to by the SearcherBuilder. #[derive(Clone, Debug)] pub struct Config { /// The line terminator to use. line_term: LineTerminator, /// Whether to invert matching. invert_match: bool, /// The number of lines after a match to include. after_context: usize, /// The number of lines before a match to include. before_context: usize, /// Whether to enable unbounded context or not. passthru: bool, /// Whether to count line numbers. line_number: bool, /// The maximum amount of heap memory to use. /// /// When not given, no explicit limit is enforced. When set to `0`, then /// only the memory map search strategy is available. heap_limit: Option, /// The memory map strategy. mmap: MmapChoice, /// The binary data detection strategy. binary: BinaryDetection, /// Whether to enable matching across multiple lines. multi_line: bool, /// An encoding that, when present, causes the searcher to transcode all /// input from the encoding to UTF-8. encoding: Option, /// Whether to do automatic transcoding based on a BOM or not. bom_sniffing: bool, } impl Default for Config { fn default() -> Config { Config { line_term: LineTerminator::default(), invert_match: false, after_context: 0, before_context: 0, passthru: false, line_number: true, heap_limit: None, mmap: MmapChoice::default(), binary: BinaryDetection::default(), multi_line: false, encoding: None, bom_sniffing: true, } } } impl Config { /// Return the maximal amount of lines needed to fulfill this /// configuration's context. /// /// If this returns `0`, then no context is ever needed. fn max_context(&self) -> usize { cmp::max(self.before_context, self.after_context) } /// Build a line buffer from this configuration. fn line_buffer(&self) -> LineBuffer { let mut builder = LineBufferBuilder::new(); builder .line_terminator(self.line_term.as_byte()) .binary_detection(self.binary.0); if let Some(limit) = self.heap_limit { let (capacity, additional) = if limit <= DEFAULT_BUFFER_CAPACITY { (limit, 0) } else { (DEFAULT_BUFFER_CAPACITY, limit - DEFAULT_BUFFER_CAPACITY) }; builder .capacity(capacity) .buffer_alloc(BufferAllocation::Error(additional)); } builder.build() } } /// An error that can occur when building a searcher. /// /// This error occurs when a non-sensical configuration is present when trying /// to construct a `Searcher` from a `SearcherBuilder`. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ConfigError { /// Indicates that the heap limit configuration prevents all possible /// search strategies from being used. For example, if the heap limit is /// set to 0 and memory map searching is disabled or unavailable. SearchUnavailable, /// Occurs when a matcher reports a line terminator that is different than /// the one configured in the searcher. MismatchedLineTerminators { /// The matcher's line terminator. matcher: LineTerminator, /// The searcher's line terminator. searcher: LineTerminator, }, /// Occurs when no encoding could be found for a particular label. UnknownEncoding { /// The provided encoding label that could not be found. label: Vec, }, /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients /// don't count on exhaustive matching. (Otherwise, adding a new variant /// could break existing code.) #[doc(hidden)] __Nonexhaustive, } impl ::std::error::Error for ConfigError { fn description(&self) -> &str { "grep-searcher configuration error" } } impl fmt::Display for ConfigError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match *self { ConfigError::SearchUnavailable => { write!(f, "grep config error: no available searchers") } ConfigError::MismatchedLineTerminators { matcher, searcher } => { write!( f, "grep config error: mismatched line terminators, \ matcher has {:?} but searcher has {:?}", matcher, searcher ) } ConfigError::UnknownEncoding { ref label } => write!( f, "grep config error: unknown encoding: {}", String::from_utf8_lossy(label), ), _ => panic!("BUG: unexpected variant found"), } } } /// A builder for configuring a searcher. /// /// A search builder permits specifying the configuration of a searcher, /// including options like whether to invert the search or to enable multi /// line search. /// /// Once a searcher has been built, it is beneficial to reuse that searcher /// for multiple searches, if possible. #[derive(Clone, Debug)] pub struct SearcherBuilder { config: Config, } impl Default for SearcherBuilder { fn default() -> SearcherBuilder { SearcherBuilder::new() } } impl SearcherBuilder { /// Create a new searcher builder with a default configuration. pub fn new() -> SearcherBuilder { SearcherBuilder { config: Config::default() } } /// Build a searcher with the given matcher. pub fn build(&self) -> Searcher { let mut config = self.config.clone(); if config.passthru { config.before_context = 0; config.after_context = 0; } let mut decode_builder = DecodeReaderBytesBuilder::new(); decode_builder .encoding(self.config.encoding.as_ref().map(|e| e.0)) .utf8_passthru(true) .strip_bom(self.config.bom_sniffing) .bom_override(true) .bom_sniffing(self.config.bom_sniffing); Searcher { config: config, decode_builder: decode_builder, decode_buffer: RefCell::new(vec![0; 8 * (1 << 10)]), line_buffer: RefCell::new(self.config.line_buffer()), multi_line_buffer: RefCell::new(vec![]), } } /// Set the line terminator that is used by the searcher. /// /// When using a searcher, if the matcher provided has a line terminator /// set, then it must be the same as this one. If they aren't, building /// a searcher will return an error. /// /// By default, this is set to `b'\n'`. pub fn line_terminator( &mut self, line_term: LineTerminator, ) -> &mut SearcherBuilder { self.config.line_term = line_term; self } /// Whether to invert matching, whereby lines that don't match are reported /// instead of reporting lines that do match. /// /// By default, this is disabled. pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder { self.config.invert_match = yes; self } /// Whether to count and include line numbers with matching lines. /// /// This is enabled by default. There is a small performance penalty /// associated with computing line numbers, so this can be disabled when /// this isn't desirable. pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder { self.config.line_number = yes; self } /// Whether to enable multi line search or not. /// /// When multi line search is enabled, matches *may* match across multiple /// lines. Conversely, when multi line search is disabled, it is impossible /// for any match to span more than one line. /// /// **Warning:** multi line search requires having the entire contents to /// search mapped in memory at once. When searching files, memory maps /// will be used if possible and if they are enabled, which avoids using /// your program's heap. However, if memory maps cannot be used (e.g., /// for searching streams like `stdin` or if transcoding is necessary), /// then the entire contents of the stream are read on to the heap before /// starting the search. /// /// This is disabled by default. pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder { self.config.multi_line = yes; self } /// Whether to include a fixed number of lines after every match. /// /// When this is set to a non-zero number, then the searcher will report /// `line_count` contextual lines after every match. /// /// This is set to `0` by default. pub fn after_context( &mut self, line_count: usize, ) -> &mut SearcherBuilder { self.config.after_context = line_count; self } /// Whether to include a fixed number of lines before every match. /// /// When this is set to a non-zero number, then the searcher will report /// `line_count` contextual lines before every match. /// /// This is set to `0` by default. pub fn before_context( &mut self, line_count: usize, ) -> &mut SearcherBuilder { self.config.before_context = line_count; self } /// Whether to enable the "passthru" feature or not. /// /// When passthru is enabled, it effectively treats all non-matching lines /// as contextual lines. In other words, enabling this is akin to /// requesting an unbounded number of before and after contextual lines. /// /// When passthru mode is enabled, any `before_context` or `after_context` /// settings are ignored by setting them to `0`. /// /// This is disabled by default. pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder { self.config.passthru = yes; self } /// Set an approximate limit on the amount of heap space used by a /// searcher. /// /// The heap limit is enforced in two scenarios: /// /// * When searching using a fixed size buffer, the heap limit controls /// how big this buffer is allowed to be. Assuming contexts are disabled, /// the minimum size of this buffer is the length (in bytes) of the /// largest single line in the contents being searched. If any line /// exceeds the heap limit, then an error will be returned. /// * When performing a multi line search, a fixed size buffer cannot be /// used. Thus, the only choices are to read the entire contents on to /// the heap, or use memory maps. In the former case, the heap limit set /// here is enforced. /// /// If a heap limit is set to `0`, then no heap space is used. If there are /// no alternative strategies available for searching without heap space /// (e.g., memory maps are disabled), then the searcher wil return an error /// immediately. /// /// By default, no limit is set. pub fn heap_limit( &mut self, bytes: Option, ) -> &mut SearcherBuilder { self.config.heap_limit = bytes; self } /// Set the strategy to employ use of memory maps. /// /// Currently, there are only two strategies that can be employed: /// /// * **Automatic** - A searcher will use heuristics, including but not /// limited to file size and platform, to determine whether to use memory /// maps or not. /// * **Never** - Memory maps will never be used. If multi line search is /// enabled, then the entire contents will be read on to the heap before /// searching begins. /// /// The default behavior is **never**. Generally speaking, and perhaps /// against conventional wisdom, memory maps don't necessarily enable /// faster searching. For example, depending on the platform, using memory /// maps while searching a large directory can actually be quite a bit /// slower than using normal read calls because of the overhead of managing /// the memory maps. /// /// Memory maps can be faster in some cases however. On some platforms, /// when searching a very large file that *is already in memory*, it can /// be slightly faster to search it as a memory map instead of using /// normal read calls. /// /// Finally, memory maps have a somewhat complicated safety story in Rust. /// If you aren't sure whether enabling memory maps is worth it, then just /// don't bother with it. /// /// **WARNING**: If your process is searching a file backed memory map /// at the same time that file is truncated, then it's possible for the /// process to terminate with a bus error. pub fn memory_map( &mut self, strategy: MmapChoice, ) -> &mut SearcherBuilder { self.config.mmap = strategy; self } /// Set the binary detection strategy. /// /// The binary detection strategy determines not only how the searcher /// detects binary data, but how it responds to the presence of binary /// data. See the [`BinaryDetection`](struct.BinaryDetection.html) type /// for more information. /// /// By default, binary detection is disabled. pub fn binary_detection( &mut self, detection: BinaryDetection, ) -> &mut SearcherBuilder { self.config.binary = detection; self } /// Set the encoding used to read the source data before searching. /// /// When an encoding is provided, then the source data is _unconditionally_ /// transcoded using the encoding, unless a BOM is present. If a BOM is /// present, then the encoding indicated by the BOM is used instead. If the /// transcoding process encounters an error, then bytes are replaced with /// the Unicode replacement codepoint. /// /// When no encoding is specified (the default), then BOM sniffing is /// used (if it's enabled, which it is, by default) to determine whether /// the source data is UTF-8 or UTF-16, and transcoding will be performed /// automatically. If no BOM could be found, then the source data is /// searched _as if_ it were UTF-8. However, so long as the source data is /// at least ASCII compatible, then it is possible for a search to produce /// useful results. pub fn encoding( &mut self, encoding: Option, ) -> &mut SearcherBuilder { self.config.encoding = encoding; self } /// Enable automatic transcoding based on BOM sniffing. /// /// When this is enabled and an explicit encoding is not set, then this /// searcher will try to detect the encoding of the bytes being searched /// by sniffing its byte-order mark (BOM). In particular, when this is /// enabled, UTF-16 encoded files will be searched seamlessly. /// /// When this is disabled and if an explicit encoding is not set, then /// the bytes from the source stream will be passed through unchanged, /// including its BOM, if one is present. /// /// This is enabled by default. pub fn bom_sniffing(&mut self, yes: bool) -> &mut SearcherBuilder { self.config.bom_sniffing = yes; self } } /// A searcher executes searches over a haystack and writes results to a caller /// provided sink. /// /// Matches are detected via implementations of the `Matcher` trait, which must /// be provided by the caller when executing a search. /// /// When possible, a searcher should be reused. #[derive(Clone, Debug)] pub struct Searcher { /// The configuration for this searcher. /// /// We make most of these settings available to users of `Searcher` via /// public API methods, which can be queried in implementations of `Sink` /// if necessary. config: Config, /// A builder for constructing a streaming reader that transcodes source /// data according to either an explicitly specified encoding or via an /// automatically detected encoding via BOM sniffing. /// /// When no transcoding is needed, then the transcoder built will pass /// through the underlying bytes with no additional overhead. decode_builder: DecodeReaderBytesBuilder, /// A buffer that is used for transcoding scratch space. decode_buffer: RefCell>, /// A line buffer for use in line oriented searching. /// /// We wrap it in a RefCell to permit lending out borrows of `Searcher` /// to sinks. We still require a mutable borrow to execute a search, so /// we statically prevent callers from causing RefCell to panic at runtime /// due to a borrowing violation. line_buffer: RefCell, /// A buffer in which to store the contents of a reader when performing a /// multi line search. In particular, multi line searches cannot be /// performed incrementally, and need the entire haystack in memory at /// once. multi_line_buffer: RefCell>, } impl Searcher { /// Create a new searcher with a default configuration. /// /// To configure the searcher (e.g., invert matching, enable memory maps, /// enable contexts, etc.), use the /// [`SearcherBuilder`](struct.SearcherBuilder.html). pub fn new() -> Searcher { SearcherBuilder::new().build() } /// Execute a search over the file with the given path and write the /// results to the given sink. /// /// If memory maps are enabled and the searcher heuristically believes /// memory maps will help the search run faster, then this will use /// memory maps. For this reason, callers should prefer using this method /// or `search_file` over the more generic `search_reader` when possible. pub fn search_path( &mut self, matcher: M, path: P, write_to: S, ) -> Result<(), S::Error> where P: AsRef, M: Matcher, S: Sink, { let path = path.as_ref(); let file = File::open(path).map_err(S::Error::error_io)?; self.search_file_maybe_path(matcher, Some(path), &file, write_to) } /// Execute a search over a file and write the results to the given sink. /// /// If memory maps are enabled and the searcher heuristically believes /// memory maps will help the search run faster, then this will use /// memory maps. For this reason, callers should prefer using this method /// or `search_path` over the more generic `search_reader` when possible. pub fn search_file( &mut self, matcher: M, file: &File, write_to: S, ) -> Result<(), S::Error> where M: Matcher, S: Sink, { self.search_file_maybe_path(matcher, None, file, write_to) } fn search_file_maybe_path( &mut self, matcher: M, path: Option<&Path>, file: &File, write_to: S, ) -> Result<(), S::Error> where M: Matcher, S: Sink, { if let Some(mmap) = self.config.mmap.open(file, path) { log::trace!("{:?}: searching via memory map", path); return self.search_slice(matcher, &mmap, write_to); } // Fast path for multi-line searches of files when memory maps are // not enabled. This pre-allocates a buffer roughly the size of the // file, which isn't possible when searching an arbitrary io::Read. if self.multi_line_with_matcher(&matcher) { log::trace!( "{:?}: reading entire file on to heap for mulitline", path ); self.fill_multi_line_buffer_from_file::(file)?; log::trace!("{:?}: searching via multiline strategy", path); MultiLine::new( self, matcher, &*self.multi_line_buffer.borrow(), write_to, ) .run() } else { log::trace!("{:?}: searching using generic reader", path); self.search_reader(matcher, file, write_to) } } /// Execute a search over any implementation of `io::Read` and write the /// results to the given sink. /// /// When possible, this implementation will search the reader incrementally /// without reading it into memory. In some cases---for example, if multi /// line search is enabled---an incremental search isn't possible and the /// given reader is consumed completely and placed on the heap before /// searching begins. For this reason, when multi line search is enabled, /// one should try to use higher level APIs (e.g., searching by file or /// file path) so that memory maps can be used if they are available and /// enabled. pub fn search_reader( &mut self, matcher: M, read_from: R, write_to: S, ) -> Result<(), S::Error> where M: Matcher, R: io::Read, S: Sink, { self.check_config(&matcher).map_err(S::Error::error_config)?; let mut decode_buffer = self.decode_buffer.borrow_mut(); let decoder = self .decode_builder .build_with_buffer(read_from, &mut *decode_buffer) .map_err(S::Error::error_io)?; if self.multi_line_with_matcher(&matcher) { log::trace!( "generic reader: reading everything to heap for multiline" ); self.fill_multi_line_buffer_from_reader::<_, S>(decoder)?; log::trace!("generic reader: searching via multiline strategy"); MultiLine::new( self, matcher, &*self.multi_line_buffer.borrow(), write_to, ) .run() } else { let mut line_buffer = self.line_buffer.borrow_mut(); let rdr = LineBufferReader::new(decoder, &mut *line_buffer); log::trace!("generic reader: searching via roll buffer strategy"); ReadByLine::new(self, matcher, rdr, write_to).run() } } /// Execute a search over the given slice and write the results to the /// given sink. pub fn search_slice( &mut self, matcher: M, slice: &[u8], write_to: S, ) -> Result<(), S::Error> where M: Matcher, S: Sink, { self.check_config(&matcher).map_err(S::Error::error_config)?; // We can search the slice directly, unless we need to do transcoding. if self.slice_needs_transcoding(slice) { log::trace!( "slice reader: needs transcoding, using generic reader" ); return self.search_reader(matcher, slice, write_to); } if self.multi_line_with_matcher(&matcher) { log::trace!("slice reader: searching via multiline strategy"); MultiLine::new(self, matcher, slice, write_to).run() } else { log::trace!("slice reader: searching via slice-by-line strategy"); SliceByLine::new(self, matcher, slice, write_to).run() } } /// Set the binary detection method used on this searcher. pub fn set_binary_detection(&mut self, detection: BinaryDetection) { self.config.binary = detection.clone(); self.line_buffer.borrow_mut().set_binary_detection(detection.0); } /// Check that the searcher's configuration and the matcher are consistent /// with each other. fn check_config(&self, matcher: M) -> Result<(), ConfigError> { if self.config.heap_limit == Some(0) && !self.config.mmap.is_enabled() { return Err(ConfigError::SearchUnavailable); } let matcher_line_term = match matcher.line_terminator() { None => return Ok(()), Some(line_term) => line_term, }; if matcher_line_term != self.config.line_term { return Err(ConfigError::MismatchedLineTerminators { matcher: matcher_line_term, searcher: self.config.line_term, }); } Ok(()) } /// Returns true if and only if the given slice needs to be transcoded. fn slice_needs_transcoding(&self, slice: &[u8]) -> bool { self.config.encoding.is_some() || (self.config.bom_sniffing && slice_has_bom(slice)) } } /// The following methods permit querying the configuration of a searcher. /// These can be useful in generic implementations of /// [`Sink`](trait.Sink.html), /// where the output may be tailored based on how the searcher is configured. impl Searcher { /// Returns the line terminator used by this searcher. #[inline] pub fn line_terminator(&self) -> LineTerminator { self.config.line_term } /// Returns the type of binary detection configured on this searcher. #[inline] pub fn binary_detection(&self) -> &BinaryDetection { &self.config.binary } /// Returns true if and only if this searcher is configured to invert its /// search results. That is, matching lines are lines that do **not** match /// the searcher's matcher. #[inline] pub fn invert_match(&self) -> bool { self.config.invert_match } /// Returns true if and only if this searcher is configured to count line /// numbers. #[inline] pub fn line_number(&self) -> bool { self.config.line_number } /// Returns true if and only if this searcher is configured to perform /// multi line search. #[inline] pub fn multi_line(&self) -> bool { self.config.multi_line } /// Returns true if and only if this searcher will choose a multi-line /// strategy given the provided matcher. /// /// This may diverge from the result of `multi_line` in cases where the /// searcher has been configured to execute a search that can report /// matches over multiple lines, but where the matcher guarantees that it /// will never produce a match over multiple lines. pub fn multi_line_with_matcher(&self, matcher: M) -> bool { if !self.multi_line() { return false; } if let Some(line_term) = matcher.line_terminator() { if line_term == self.line_terminator() { return false; } } if let Some(non_matching) = matcher.non_matching_bytes() { // If the line terminator is CRLF, we don't actually need to care // whether the regex can match `\r` or not. Namely, a `\r` is // neither necessary nor sufficient to terminate a line. A `\n` is // always required. if non_matching.contains(self.line_terminator().as_byte()) { return false; } } true } /// Returns the number of "after" context lines to report. When context /// reporting is not enabled, this returns `0`. #[inline] pub fn after_context(&self) -> usize { self.config.after_context } /// Returns the number of "before" context lines to report. When context /// reporting is not enabled, this returns `0`. #[inline] pub fn before_context(&self) -> usize { self.config.before_context } /// Returns true if and only if the searcher has "passthru" mode enabled. #[inline] pub fn passthru(&self) -> bool { self.config.passthru } /// Fill the buffer for use with multi-line searching from the given file. /// This reads from the file until EOF or until an error occurs. If the /// contents exceed the configured heap limit, then an error is returned. fn fill_multi_line_buffer_from_file( &self, file: &File, ) -> Result<(), S::Error> { assert!(self.config.multi_line); let mut decode_buffer = self.decode_buffer.borrow_mut(); let mut read_from = self .decode_builder .build_with_buffer(file, &mut *decode_buffer) .map_err(S::Error::error_io)?; // If we don't have a heap limit, then we can defer to std's // read_to_end implementation. fill_multi_line_buffer_from_reader will // do this too, but since we have a File, we can be a bit smarter about // pre-allocating here. // // If we're transcoding, then our pre-allocation might not be exact, // but is probably still better than nothing. if self.config.heap_limit.is_none() { let mut buf = self.multi_line_buffer.borrow_mut(); buf.clear(); let cap = file.metadata().map(|m| m.len() as usize + 1).unwrap_or(0); buf.reserve(cap); read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?; return Ok(()); } self.fill_multi_line_buffer_from_reader::<_, S>(read_from) } /// Fill the buffer for use with multi-line searching from the given /// reader. This reads from the reader until EOF or until an error occurs. /// If the contents exceed the configured heap limit, then an error is /// returned. fn fill_multi_line_buffer_from_reader( &self, mut read_from: R, ) -> Result<(), S::Error> { assert!(self.config.multi_line); let mut buf = self.multi_line_buffer.borrow_mut(); buf.clear(); // If we don't have a heap limit, then we can defer to std's // read_to_end implementation... let heap_limit = match self.config.heap_limit { Some(heap_limit) => heap_limit, None => { read_from .read_to_end(&mut *buf) .map_err(S::Error::error_io)?; return Ok(()); } }; if heap_limit == 0 { return Err(S::Error::error_io(alloc_error(heap_limit))); } // ... otherwise we need to roll our own. This is likely quite a bit // slower than what is optimal, but we avoid worry about memory safety // until there's a compelling reason to speed this up. buf.resize(cmp::min(DEFAULT_BUFFER_CAPACITY, heap_limit), 0); let mut pos = 0; loop { let nread = match read_from.read(&mut buf[pos..]) { Ok(nread) => nread, Err(ref err) if err.kind() == io::ErrorKind::Interrupted => { continue; } Err(err) => return Err(S::Error::error_io(err)), }; if nread == 0 { buf.resize(pos, 0); return Ok(()); } pos += nread; if buf[pos..].is_empty() { let additional = heap_limit - buf.len(); if additional == 0 { return Err(S::Error::error_io(alloc_error(heap_limit))); } let limit = buf.len() + additional; let doubled = 2 * buf.len(); buf.resize(cmp::min(doubled, limit), 0); } } } } /// Returns true if and only if the given slice begins with a UTF-8 or UTF-16 /// BOM. /// /// This is used by the searcher to determine if a transcoder is necessary. /// Otherwise, it is advantageous to search the slice directly. fn slice_has_bom(slice: &[u8]) -> bool { let enc = match encoding_rs::Encoding::for_bom(slice) { None => return false, Some((enc, _)) => enc, }; [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE, encoding_rs::UTF_8] .contains(&enc) } #[cfg(test)] mod tests { use super::*; use crate::testutil::{KitchenSink, RegexMatcher}; #[test] fn config_error_heap_limit() { let matcher = RegexMatcher::new(""); let sink = KitchenSink::new(); let mut searcher = SearcherBuilder::new().heap_limit(Some(0)).build(); let res = searcher.search_slice(matcher, &[], sink); assert!(res.is_err()); } #[test] fn config_error_line_terminator() { let mut matcher = RegexMatcher::new(""); matcher.set_line_term(Some(LineTerminator::byte(b'z'))); let sink = KitchenSink::new(); let mut searcher = Searcher::new(); let res = searcher.search_slice(matcher, &[], sink); assert!(res.is_err()); } #[test] fn uft8_bom_sniffing() { // See: https://github.com/BurntSushi/ripgrep/issues/1638 // ripgrep must sniff utf-8 BOM, just like it does with utf-16 let matcher = RegexMatcher::new("foo"); let haystack: &[u8] = &[0xef, 0xbb, 0xbf, 0x66, 0x6f, 0x6f]; let mut sink = KitchenSink::new(); let mut searcher = SearcherBuilder::new().build(); let res = searcher.search_slice(matcher, haystack, &mut sink); assert!(res.is_ok()); let sink_output = String::from_utf8(sink.as_bytes().to_vec()).unwrap(); assert_eq!(sink_output, "1:0:foo\nbyte count:3\n"); } } grep-searcher-0.1.8/src/sink.rs000064400000000000000000000546060072674642500145250ustar 00000000000000use std::error; use std::fmt; use std::io; use grep_matcher::LineTerminator; use crate::lines::LineIter; use crate::searcher::{ConfigError, Searcher}; /// A trait that describes errors that can be reported by searchers and /// implementations of `Sink`. /// /// Unless you have a specialized use case, you probably don't need to /// implement this trait explicitly. It's likely that using `io::Error` (which /// implements this trait) for your error type is good enough, largely because /// most errors that occur during search will likely be an `io::Error`. pub trait SinkError: Sized { /// A constructor for converting any value that satisfies the /// `fmt::Display` trait into an error. fn error_message(message: T) -> Self; /// A constructor for converting I/O errors that occur while searching into /// an error of this type. /// /// By default, this is implemented via the `error_message` constructor. fn error_io(err: io::Error) -> Self { Self::error_message(err) } /// A constructor for converting configuration errors that occur while /// building a searcher into an error of this type. /// /// By default, this is implemented via the `error_message` constructor. fn error_config(err: ConfigError) -> Self { Self::error_message(err) } } /// An `io::Error` can be used as an error for `Sink` implementations out of /// the box. impl SinkError for io::Error { fn error_message(message: T) -> io::Error { io::Error::new(io::ErrorKind::Other, message.to_string()) } fn error_io(err: io::Error) -> io::Error { err } } /// A `Box` can be used as an error for `Sink` /// implementations out of the box. impl SinkError for Box { fn error_message(message: T) -> Box { Box::::from(message.to_string()) } } /// A trait that defines how results from searchers are handled. /// /// In this crate, a searcher follows the "push" model. What that means is that /// the searcher drives execution, and pushes results back to the caller. This /// is in contrast to a "pull" model where the caller drives execution and /// takes results as they need them. These are also known as "internal" and /// "external" iteration strategies, respectively. /// /// For a variety of reasons, including the complexity of the searcher /// implementation, this crate chooses the "push" or "internal" model of /// execution. Thus, in order to act on search results, callers must provide /// an implementation of this trait to a searcher, and the searcher is then /// responsible for calling the methods on this trait. /// /// This trait defines several behaviors: /// /// * What to do when a match is found. Callers must provide this. /// * What to do when an error occurs. Callers must provide this via the /// [`SinkError`](trait.SinkError.html) trait. Generally, callers can just /// use `io::Error` for this, which already implements `SinkError`. /// * What to do when a contextual line is found. By default, these are /// ignored. /// * What to do when a gap between contextual lines has been found. By /// default, this is ignored. /// * What to do when a search has started. By default, this does nothing. /// * What to do when a search has finished successfully. By default, this does /// nothing. /// /// Callers must, at minimum, specify the behavior when an error occurs and /// the behavior when a match occurs. The rest is optional. For each behavior, /// callers may report an error (say, if writing the result to another /// location failed) or simply return `false` if they want the search to stop /// (e.g., when implementing a cap on the number of search results to show). /// /// When errors are reported (whether in the searcher or in the implementation /// of `Sink`), then searchers quit immediately without calling `finish`. /// /// For simpler uses of `Sink`, callers may elect to use one of /// the more convenient but less flexible implementations in the /// [`sinks`](sinks/index.html) module. pub trait Sink { /// The type of an error that should be reported by a searcher. /// /// Errors of this type are not only returned by the methods on this /// trait, but the constructors defined in `SinkError` are also used in /// the searcher implementation itself. e.g., When a I/O error occurs when /// reading data from a file. type Error: SinkError; /// This method is called whenever a match is found. /// /// If multi line is enabled on the searcher, then the match reported here /// may span multiple lines and it may include multiple matches. When multi /// line is disabled, then the match is guaranteed to span exactly one /// non-empty line (where a single line is, at minimum, a line terminator). /// /// If this returns `true`, then searching continues. If this returns /// `false`, then searching is stopped immediately and `finish` is called. /// /// If this returns an error, then searching is stopped immediately, /// `finish` is not called and the error is bubbled back up to the caller /// of the searcher. fn matched( &mut self, _searcher: &Searcher, _mat: &SinkMatch<'_>, ) -> Result; /// This method is called whenever a context line is found, and is optional /// to implement. By default, it does nothing and returns `true`. /// /// In all cases, the context given is guaranteed to span exactly one /// non-empty line (where a single line is, at minimum, a line terminator). /// /// If this returns `true`, then searching continues. If this returns /// `false`, then searching is stopped immediately and `finish` is called. /// /// If this returns an error, then searching is stopped immediately, /// `finish` is not called and the error is bubbled back up to the caller /// of the searcher. #[inline] fn context( &mut self, _searcher: &Searcher, _context: &SinkContext<'_>, ) -> Result { Ok(true) } /// This method is called whenever a break in contextual lines is found, /// and is optional to implement. By default, it does nothing and returns /// `true`. /// /// A break can only occur when context reporting is enabled (that is, /// either or both of `before_context` or `after_context` are greater than /// `0`). More precisely, a break occurs between non-contiguous groups of /// lines. /// /// If this returns `true`, then searching continues. If this returns /// `false`, then searching is stopped immediately and `finish` is called. /// /// If this returns an error, then searching is stopped immediately, /// `finish` is not called and the error is bubbled back up to the caller /// of the searcher. #[inline] fn context_break( &mut self, _searcher: &Searcher, ) -> Result { Ok(true) } /// This method is called whenever binary detection is enabled and binary /// data is found. If binary data is found, then this is called at least /// once for the first occurrence with the absolute byte offset at which /// the binary data begins. /// /// If this returns `true`, then searching continues. If this returns /// `false`, then searching is stopped immediately and `finish` is called. /// /// If this returns an error, then searching is stopped immediately, /// `finish` is not called and the error is bubbled back up to the caller /// of the searcher. /// /// By default, it does nothing and returns `true`. #[inline] fn binary_data( &mut self, _searcher: &Searcher, _binary_byte_offset: u64, ) -> Result { Ok(true) } /// This method is called when a search has begun, before any search is /// executed. By default, this does nothing. /// /// If this returns `true`, then searching continues. If this returns /// `false`, then searching is stopped immediately and `finish` is called. /// /// If this returns an error, then searching is stopped immediately, /// `finish` is not called and the error is bubbled back up to the caller /// of the searcher. #[inline] fn begin(&mut self, _searcher: &Searcher) -> Result { Ok(true) } /// This method is called when a search has completed. By default, this /// does nothing. /// /// If this returns an error, the error is bubbled back up to the caller of /// the searcher. #[inline] fn finish( &mut self, _searcher: &Searcher, _: &SinkFinish, ) -> Result<(), Self::Error> { Ok(()) } } impl<'a, S: Sink> Sink for &'a mut S { type Error = S::Error; #[inline] fn matched( &mut self, searcher: &Searcher, mat: &SinkMatch<'_>, ) -> Result { (**self).matched(searcher, mat) } #[inline] fn context( &mut self, searcher: &Searcher, context: &SinkContext<'_>, ) -> Result { (**self).context(searcher, context) } #[inline] fn context_break( &mut self, searcher: &Searcher, ) -> Result { (**self).context_break(searcher) } #[inline] fn binary_data( &mut self, searcher: &Searcher, binary_byte_offset: u64, ) -> Result { (**self).binary_data(searcher, binary_byte_offset) } #[inline] fn begin(&mut self, searcher: &Searcher) -> Result { (**self).begin(searcher) } #[inline] fn finish( &mut self, searcher: &Searcher, sink_finish: &SinkFinish, ) -> Result<(), S::Error> { (**self).finish(searcher, sink_finish) } } impl Sink for Box { type Error = S::Error; #[inline] fn matched( &mut self, searcher: &Searcher, mat: &SinkMatch<'_>, ) -> Result { (**self).matched(searcher, mat) } #[inline] fn context( &mut self, searcher: &Searcher, context: &SinkContext<'_>, ) -> Result { (**self).context(searcher, context) } #[inline] fn context_break( &mut self, searcher: &Searcher, ) -> Result { (**self).context_break(searcher) } #[inline] fn binary_data( &mut self, searcher: &Searcher, binary_byte_offset: u64, ) -> Result { (**self).binary_data(searcher, binary_byte_offset) } #[inline] fn begin(&mut self, searcher: &Searcher) -> Result { (**self).begin(searcher) } #[inline] fn finish( &mut self, searcher: &Searcher, sink_finish: &SinkFinish, ) -> Result<(), S::Error> { (**self).finish(searcher, sink_finish) } } /// Summary data reported at the end of a search. /// /// This reports data such as the total number of bytes searched and the /// absolute offset of the first occurrence of binary data, if any were found. /// /// A searcher that stops early because of an error does not call `finish`. /// A searcher that stops early because the `Sink` implementor instructed it /// to will still call `finish`. #[derive(Clone, Debug)] pub struct SinkFinish { pub(crate) byte_count: u64, pub(crate) binary_byte_offset: Option, } impl SinkFinish { /// Return the total number of bytes searched. #[inline] pub fn byte_count(&self) -> u64 { self.byte_count } /// If binary detection is enabled and if binary data was found, then this /// returns the absolute byte offset of the first detected byte of binary /// data. /// /// Note that since this is an absolute byte offset, it cannot be relied /// upon to index into any addressable memory. #[inline] pub fn binary_byte_offset(&self) -> Option { self.binary_byte_offset } } /// A type that describes a match reported by a searcher. #[derive(Clone, Debug)] pub struct SinkMatch<'b> { pub(crate) line_term: LineTerminator, pub(crate) bytes: &'b [u8], pub(crate) absolute_byte_offset: u64, pub(crate) line_number: Option, pub(crate) buffer: &'b [u8], pub(crate) bytes_range_in_buffer: std::ops::Range, } impl<'b> SinkMatch<'b> { /// Returns the bytes for all matching lines, including the line /// terminators, if they exist. #[inline] pub fn bytes(&self) -> &'b [u8] { self.bytes } /// Return an iterator over the lines in this match. /// /// If multi line search is enabled, then this may yield more than one /// line (but always at least one line). If multi line search is disabled, /// then this always reports exactly one line (but may consist of just /// the line terminator). /// /// Lines yielded by this iterator include their terminators. #[inline] pub fn lines(&self) -> LineIter<'b> { LineIter::new(self.line_term.as_byte(), self.bytes) } /// Returns the absolute byte offset of the start of this match. This /// offset is absolute in that it is relative to the very beginning of the /// input in a search, and can never be relied upon to be a valid index /// into an in-memory slice. #[inline] pub fn absolute_byte_offset(&self) -> u64 { self.absolute_byte_offset } /// Returns the line number of the first line in this match, if available. /// /// Line numbers are only available when the search builder is instructed /// to compute them. #[inline] pub fn line_number(&self) -> Option { self.line_number } /// TODO #[inline] pub fn buffer(&self) -> &'b [u8] { self.buffer } /// TODO #[inline] pub fn bytes_range_in_buffer(&self) -> std::ops::Range { self.bytes_range_in_buffer.clone() } } /// The type of context reported by a searcher. #[derive(Clone, Debug, Eq, PartialEq)] pub enum SinkContextKind { /// The line reported occurred before a match. Before, /// The line reported occurred after a match. After, /// Any other type of context reported, e.g., as a result of a searcher's /// "passthru" mode. Other, } /// A type that describes a contextual line reported by a searcher. #[derive(Clone, Debug)] pub struct SinkContext<'b> { pub(crate) line_term: LineTerminator, pub(crate) bytes: &'b [u8], pub(crate) kind: SinkContextKind, pub(crate) absolute_byte_offset: u64, pub(crate) line_number: Option, } impl<'b> SinkContext<'b> { /// Returns the context bytes, including line terminators. #[inline] pub fn bytes(&self) -> &'b [u8] { self.bytes } /// Returns the type of context. #[inline] pub fn kind(&self) -> &SinkContextKind { &self.kind } /// Return an iterator over the lines in this match. /// /// This always yields exactly one line (and that one line may contain just /// the line terminator). /// /// Lines yielded by this iterator include their terminators. #[cfg(test)] pub(crate) fn lines(&self) -> LineIter<'b> { LineIter::new(self.line_term.as_byte(), self.bytes) } /// Returns the absolute byte offset of the start of this context. This /// offset is absolute in that it is relative to the very beginning of the /// input in a search, and can never be relied upon to be a valid index /// into an in-memory slice. #[inline] pub fn absolute_byte_offset(&self) -> u64 { self.absolute_byte_offset } /// Returns the line number of the first line in this context, if /// available. /// /// Line numbers are only available when the search builder is instructed /// to compute them. #[inline] pub fn line_number(&self) -> Option { self.line_number } } /// A collection of convenience implementations of `Sink`. /// /// Each implementation in this module makes some kind of sacrifice in the name /// of making common cases easier to use. Most frequently, each type is a /// wrapper around a closure specified by the caller that provides limited /// access to the full suite of information available to implementors of /// `Sink`. /// /// For example, the `UTF8` sink makes the following sacrifices: /// /// * All matches must be UTF-8. An arbitrary `Sink` does not have this /// restriction and can deal with arbitrary data. If this sink sees invalid /// UTF-8, then an error is returned and searching stops. (Use the `Lossy` /// sink instead to suppress this error.) /// * The searcher must be configured to report line numbers. If it isn't, /// an error is reported at the first match and searching stops. /// * Context lines, context breaks and summary data reported at the end of /// a search are all ignored. /// * Implementors are forced to use `io::Error` as their error type. /// /// If you need more flexibility, then you're advised to implement the `Sink` /// trait directly. pub mod sinks { use std::io; use std::str; use super::{Sink, SinkError, SinkMatch}; use crate::searcher::Searcher; /// A sink that provides line numbers and matches as strings while ignoring /// everything else. /// /// This implementation will return an error if a match contains invalid /// UTF-8 or if the searcher was not configured to count lines. Errors /// on invalid UTF-8 can be suppressed by using the `Lossy` sink instead /// of this one. /// /// The closure accepts two parameters: a line number and a UTF-8 string /// containing the matched data. The closure returns a /// `Result`. If the `bool` is `false`, then the search /// stops immediately. Otherwise, searching continues. /// /// If multi line mode was enabled, the line number refers to the line /// number of the first line in the match. #[derive(Clone, Debug)] pub struct UTF8(pub F) where F: FnMut(u64, &str) -> Result; impl Sink for UTF8 where F: FnMut(u64, &str) -> Result, { type Error = io::Error; fn matched( &mut self, _searcher: &Searcher, mat: &SinkMatch<'_>, ) -> Result { let matched = match str::from_utf8(mat.bytes()) { Ok(matched) => matched, Err(err) => return Err(io::Error::error_message(err)), }; let line_number = match mat.line_number() { Some(line_number) => line_number, None => { let msg = "line numbers not enabled"; return Err(io::Error::error_message(msg)); } }; (self.0)(line_number, &matched) } } /// A sink that provides line numbers and matches as (lossily converted) /// strings while ignoring everything else. /// /// This is like `UTF8`, except that if a match contains invalid UTF-8, /// then it will be lossily converted to valid UTF-8 by substituting /// invalid UTF-8 with Unicode replacement characters. /// /// This implementation will return an error on the first match if the /// searcher was not configured to count lines. /// /// The closure accepts two parameters: a line number and a UTF-8 string /// containing the matched data. The closure returns a /// `Result`. If the `bool` is `false`, then the search /// stops immediately. Otherwise, searching continues. /// /// If multi line mode was enabled, the line number refers to the line /// number of the first line in the match. #[derive(Clone, Debug)] pub struct Lossy(pub F) where F: FnMut(u64, &str) -> Result; impl Sink for Lossy where F: FnMut(u64, &str) -> Result, { type Error = io::Error; fn matched( &mut self, _searcher: &Searcher, mat: &SinkMatch<'_>, ) -> Result { use std::borrow::Cow; let matched = match str::from_utf8(mat.bytes()) { Ok(matched) => Cow::Borrowed(matched), // TODO: In theory, it should be possible to amortize // allocation here, but `std` doesn't provide such an API. // Regardless, this only happens on matches with invalid UTF-8, // which should be pretty rare. Err(_) => String::from_utf8_lossy(mat.bytes()), }; let line_number = match mat.line_number() { Some(line_number) => line_number, None => { let msg = "line numbers not enabled"; return Err(io::Error::error_message(msg)); } }; (self.0)(line_number, &matched) } } /// A sink that provides line numbers and matches as raw bytes while /// ignoring everything else. /// /// This implementation will return an error on the first match if the /// searcher was not configured to count lines. /// /// The closure accepts two parameters: a line number and a raw byte string /// containing the matched data. The closure returns a `Result`. If the `bool` is `false`, then the search stops /// immediately. Otherwise, searching continues. /// /// If multi line mode was enabled, the line number refers to the line /// number of the first line in the match. #[derive(Clone, Debug)] pub struct Bytes(pub F) where F: FnMut(u64, &[u8]) -> Result; impl Sink for Bytes where F: FnMut(u64, &[u8]) -> Result, { type Error = io::Error; fn matched( &mut self, _searcher: &Searcher, mat: &SinkMatch<'_>, ) -> Result { let line_number = match mat.line_number() { Some(line_number) => line_number, None => { let msg = "line numbers not enabled"; return Err(io::Error::error_message(msg)); } }; (self.0)(line_number, mat.bytes()) } } } grep-searcher-0.1.8/src/testutil.rs000064400000000000000000000653750072674642500154430ustar 00000000000000use std::io::{self, Write}; use std::str; use bstr::ByteSlice; use grep_matcher::{ LineMatchKind, LineTerminator, Match, Matcher, NoCaptures, NoError, }; use regex::bytes::{Regex, RegexBuilder}; use crate::searcher::{BinaryDetection, Searcher, SearcherBuilder}; use crate::sink::{Sink, SinkContext, SinkFinish, SinkMatch}; /// A simple regex matcher. /// /// This supports setting the matcher's line terminator configuration directly, /// which we use for testing purposes. That is, the caller explicitly /// determines whether the line terminator optimization is enabled. (In reality /// this optimization is detected automatically by inspecting and possibly /// modifying the regex itself.) #[derive(Clone, Debug)] pub struct RegexMatcher { regex: Regex, line_term: Option, every_line_is_candidate: bool, } impl RegexMatcher { /// Create a new regex matcher. pub fn new(pattern: &str) -> RegexMatcher { let regex = RegexBuilder::new(pattern) .multi_line(true) // permits ^ and $ to match at \n boundaries .build() .unwrap(); RegexMatcher { regex: regex, line_term: None, every_line_is_candidate: false, } } /// Forcefully set the line terminator of this matcher. /// /// By default, this matcher has no line terminator set. pub fn set_line_term( &mut self, line_term: Option, ) -> &mut RegexMatcher { self.line_term = line_term; self } /// Whether to return every line as a candidate or not. /// /// This forces searchers to handle the case of reporting a false positive. pub fn every_line_is_candidate(&mut self, yes: bool) -> &mut RegexMatcher { self.every_line_is_candidate = yes; self } } impl Matcher for RegexMatcher { type Captures = NoCaptures; type Error = NoError; fn find_at( &self, haystack: &[u8], at: usize, ) -> Result, NoError> { Ok(self .regex .find_at(haystack, at) .map(|m| Match::new(m.start(), m.end()))) } fn new_captures(&self) -> Result { Ok(NoCaptures::new()) } fn line_terminator(&self) -> Option { self.line_term } fn find_candidate_line( &self, haystack: &[u8], ) -> Result, NoError> { if self.every_line_is_candidate { assert!(self.line_term.is_some()); if haystack.is_empty() { return Ok(None); } // Make it interesting and return the last byte in the current // line. let i = haystack .find_byte(self.line_term.unwrap().as_byte()) .map(|i| i) .unwrap_or(haystack.len() - 1); Ok(Some(LineMatchKind::Candidate(i))) } else { Ok(self.shortest_match(haystack)?.map(LineMatchKind::Confirmed)) } } } /// An implementation of Sink that prints all available information. /// /// This is useful for tests because it lets us easily confirm whether data /// is being passed to Sink correctly. #[derive(Clone, Debug)] pub struct KitchenSink(Vec); impl KitchenSink { /// Create a new implementation of Sink that includes everything in the /// kitchen. pub fn new() -> KitchenSink { KitchenSink(vec![]) } /// Return the data written to this sink. pub fn as_bytes(&self) -> &[u8] { &self.0 } } impl Sink for KitchenSink { type Error = io::Error; fn matched( &mut self, _searcher: &Searcher, mat: &SinkMatch<'_>, ) -> Result { assert!(!mat.bytes().is_empty()); assert!(mat.lines().count() >= 1); let mut line_number = mat.line_number(); let mut byte_offset = mat.absolute_byte_offset(); for line in mat.lines() { if let Some(ref mut n) = line_number { write!(self.0, "{}:", n)?; *n += 1; } write!(self.0, "{}:", byte_offset)?; byte_offset += line.len() as u64; self.0.write_all(line)?; } Ok(true) } fn context( &mut self, _searcher: &Searcher, context: &SinkContext<'_>, ) -> Result { assert!(!context.bytes().is_empty()); assert!(context.lines().count() == 1); if let Some(line_number) = context.line_number() { write!(self.0, "{}-", line_number)?; } write!(self.0, "{}-", context.absolute_byte_offset)?; self.0.write_all(context.bytes())?; Ok(true) } fn context_break( &mut self, _searcher: &Searcher, ) -> Result { self.0.write_all(b"--\n")?; Ok(true) } fn finish( &mut self, _searcher: &Searcher, sink_finish: &SinkFinish, ) -> Result<(), io::Error> { writeln!(self.0, "")?; writeln!(self.0, "byte count:{}", sink_finish.byte_count())?; if let Some(offset) = sink_finish.binary_byte_offset() { writeln!(self.0, "binary offset:{}", offset)?; } Ok(()) } } /// A type for expressing tests on a searcher. /// /// The searcher code has a lot of different code paths, mostly for the /// purposes of optimizing a bunch of different use cases. The intent of the /// searcher is to pick the best code path based on the configuration, which /// means there is no obviously direct way to ask that a specific code path /// be exercised. Thus, the purpose of this tester is to explicitly check as /// many code paths that make sense. /// /// The tester works by assuming you want to test all pertinent code paths. /// These can be trimmed down as necessary via the various builder methods. #[derive(Debug)] pub struct SearcherTester { haystack: String, pattern: String, filter: Option<::regex::Regex>, print_labels: bool, expected_no_line_number: Option, expected_with_line_number: Option, expected_slice_no_line_number: Option, expected_slice_with_line_number: Option, by_line: bool, multi_line: bool, invert_match: bool, line_number: bool, binary: BinaryDetection, auto_heap_limit: bool, after_context: usize, before_context: usize, passthru: bool, } impl SearcherTester { /// Create a new tester for testing searchers. pub fn new(haystack: &str, pattern: &str) -> SearcherTester { SearcherTester { haystack: haystack.to_string(), pattern: pattern.to_string(), filter: None, print_labels: false, expected_no_line_number: None, expected_with_line_number: None, expected_slice_no_line_number: None, expected_slice_with_line_number: None, by_line: true, multi_line: true, invert_match: false, line_number: true, binary: BinaryDetection::none(), auto_heap_limit: true, after_context: 0, before_context: 0, passthru: false, } } /// Execute the test. If the test succeeds, then this returns successfully. /// If the test fails, then it panics with an informative message. pub fn test(&self) { // Check for configuration errors. if self.expected_no_line_number.is_none() { panic!("an 'expected' string with NO line numbers must be given"); } if self.line_number && self.expected_with_line_number.is_none() { panic!( "an 'expected' string with line numbers must be given, \ or disable testing with line numbers" ); } let configs = self.configs(); if configs.is_empty() { panic!("test configuration resulted in nothing being tested"); } if self.print_labels { for config in &configs { let labels = vec![ format!("reader-{}", config.label), format!("slice-{}", config.label), ]; for label in &labels { if self.include(label) { println!("{}", label); } else { println!("{} (ignored)", label); } } } } for config in &configs { let label = format!("reader-{}", config.label); if self.include(&label) { let got = config.search_reader(&self.haystack); assert_eq_printed!(config.expected_reader, got, "{}", label); } let label = format!("slice-{}", config.label); if self.include(&label) { let got = config.search_slice(&self.haystack); assert_eq_printed!(config.expected_slice, got, "{}", label); } } } /// Set a regex pattern to filter the tests that are run. /// /// By default, no filter is present. When a filter is set, only test /// configurations with a label matching the given pattern will be run. /// /// This is often useful when debugging tests, e.g., when you want to do /// printf debugging and only want one particular test configuration to /// execute. #[allow(dead_code)] pub fn filter(&mut self, pattern: &str) -> &mut SearcherTester { self.filter = Some(::regex::Regex::new(pattern).unwrap()); self } /// When set, the labels for all test configurations are printed before /// executing any test. /// /// Note that in order to see these in tests that aren't failing, you'll /// want to use `cargo test -- --nocapture`. #[allow(dead_code)] pub fn print_labels(&mut self, yes: bool) -> &mut SearcherTester { self.print_labels = yes; self } /// Set the expected search results, without line numbers. pub fn expected_no_line_number( &mut self, exp: &str, ) -> &mut SearcherTester { self.expected_no_line_number = Some(exp.to_string()); self } /// Set the expected search results, with line numbers. pub fn expected_with_line_number( &mut self, exp: &str, ) -> &mut SearcherTester { self.expected_with_line_number = Some(exp.to_string()); self } /// Set the expected search results, without line numbers, when performing /// a search on a slice. When not present, `expected_no_line_number` is /// used instead. pub fn expected_slice_no_line_number( &mut self, exp: &str, ) -> &mut SearcherTester { self.expected_slice_no_line_number = Some(exp.to_string()); self } /// Set the expected search results, with line numbers, when performing a /// search on a slice. When not present, `expected_with_line_number` is /// used instead. #[allow(dead_code)] pub fn expected_slice_with_line_number( &mut self, exp: &str, ) -> &mut SearcherTester { self.expected_slice_with_line_number = Some(exp.to_string()); self } /// Whether to test search with line numbers or not. /// /// This is enabled by default. When enabled, the string that is expected /// when line numbers are present must be provided. Otherwise, the expected /// string isn't required. pub fn line_number(&mut self, yes: bool) -> &mut SearcherTester { self.line_number = yes; self } /// Whether to test search using the line-by-line searcher or not. /// /// By default, this is enabled. pub fn by_line(&mut self, yes: bool) -> &mut SearcherTester { self.by_line = yes; self } /// Whether to test search using the multi line searcher or not. /// /// By default, this is enabled. #[allow(dead_code)] pub fn multi_line(&mut self, yes: bool) -> &mut SearcherTester { self.multi_line = yes; self } /// Whether to perform an inverted search or not. /// /// By default, this is disabled. pub fn invert_match(&mut self, yes: bool) -> &mut SearcherTester { self.invert_match = yes; self } /// Whether to enable binary detection on all searches. /// /// By default, this is disabled. pub fn binary_detection( &mut self, detection: BinaryDetection, ) -> &mut SearcherTester { self.binary = detection; self } /// Whether to automatically attempt to test the heap limit setting or not. /// /// By default, one of the test configurations includes setting the heap /// limit to its minimal value for normal operation, which checks that /// everything works even at the extremes. However, in some cases, the heap /// limit can (expectedly) alter the output slightly. For example, it can /// impact the number of bytes searched when performing binary detection. /// For convenience, it can be useful to disable the automatic heap limit /// test. pub fn auto_heap_limit(&mut self, yes: bool) -> &mut SearcherTester { self.auto_heap_limit = yes; self } /// Set the number of lines to include in the "after" context. /// /// The default is `0`, which is equivalent to not printing any context. pub fn after_context(&mut self, lines: usize) -> &mut SearcherTester { self.after_context = lines; self } /// Set the number of lines to include in the "before" context. /// /// The default is `0`, which is equivalent to not printing any context. pub fn before_context(&mut self, lines: usize) -> &mut SearcherTester { self.before_context = lines; self } /// Whether to enable the "passthru" feature or not. /// /// When passthru is enabled, it effectively treats all non-matching lines /// as contextual lines. In other words, enabling this is akin to /// requesting an unbounded number of before and after contextual lines. /// /// This is disabled by default. pub fn passthru(&mut self, yes: bool) -> &mut SearcherTester { self.passthru = yes; self } /// Return the minimum size of a buffer required for a successful search. /// /// Generally, this corresponds to the maximum length of a line (including /// its terminator), but if context settings are enabled, then this must /// include the sum of the longest N lines. /// /// Note that this must account for whether the test is using multi line /// search or not, since multi line search requires being able to fit the /// entire haystack into memory. fn minimal_heap_limit(&self, multi_line: bool) -> usize { if multi_line { 1 + self.haystack.len() } else if self.before_context == 0 && self.after_context == 0 { 1 + self.haystack.lines().map(|s| s.len()).max().unwrap_or(0) } else { let mut lens: Vec = self.haystack.lines().map(|s| s.len()).collect(); lens.sort(); lens.reverse(); let context_count = if self.passthru { self.haystack.lines().count() } else { // Why do we add 2 here? Well, we need to add 1 in order to // have room to search at least one line. We add another // because the implementation will occasionally include // an additional line when handling the context. There's // no particularly good reason, other than keeping the // implementation simple. 2 + self.before_context + self.after_context }; // We add 1 to each line since `str::lines` doesn't include the // line terminator. lens.into_iter() .take(context_count) .map(|len| len + 1) .sum::() } } /// Returns true if and only if the given label should be included as part /// of executing `test`. /// /// Inclusion is determined by the filter specified. If no filter has been /// given, then this always returns `true`. fn include(&self, label: &str) -> bool { let re = match self.filter { None => return true, Some(ref re) => re, }; re.is_match(label) } /// Configs generates a set of all search configurations that should be /// tested. The configs generated are based on the configuration in this /// builder. fn configs(&self) -> Vec { let mut configs = vec![]; let matcher = RegexMatcher::new(&self.pattern); let mut builder = SearcherBuilder::new(); builder .line_number(false) .invert_match(self.invert_match) .binary_detection(self.binary.clone()) .after_context(self.after_context) .before_context(self.before_context) .passthru(self.passthru); if self.by_line { let mut matcher = matcher.clone(); let mut builder = builder.clone(); let expected_reader = self.expected_no_line_number.as_ref().unwrap().to_string(); let expected_slice = match self.expected_slice_no_line_number { None => expected_reader.clone(), Some(ref e) => e.to_string(), }; configs.push(TesterConfig { label: "byline-noterm-nonumber".to_string(), expected_reader: expected_reader.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); if self.auto_heap_limit { builder.heap_limit(Some(self.minimal_heap_limit(false))); configs.push(TesterConfig { label: "byline-noterm-nonumber-heaplimit".to_string(), expected_reader: expected_reader.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); builder.heap_limit(None); } matcher.set_line_term(Some(LineTerminator::byte(b'\n'))); configs.push(TesterConfig { label: "byline-term-nonumber".to_string(), expected_reader: expected_reader.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); matcher.every_line_is_candidate(true); configs.push(TesterConfig { label: "byline-term-nonumber-candidates".to_string(), expected_reader: expected_reader.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); } if self.by_line && self.line_number { let mut matcher = matcher.clone(); let mut builder = builder.clone(); let expected_reader = self.expected_with_line_number.as_ref().unwrap().to_string(); let expected_slice = match self.expected_slice_with_line_number { None => expected_reader.clone(), Some(ref e) => e.to_string(), }; builder.line_number(true); configs.push(TesterConfig { label: "byline-noterm-number".to_string(), expected_reader: expected_reader.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); matcher.set_line_term(Some(LineTerminator::byte(b'\n'))); configs.push(TesterConfig { label: "byline-term-number".to_string(), expected_reader: expected_reader.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); matcher.every_line_is_candidate(true); configs.push(TesterConfig { label: "byline-term-number-candidates".to_string(), expected_reader: expected_reader.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); } if self.multi_line { let mut builder = builder.clone(); let expected_slice = match self.expected_slice_no_line_number { None => { self.expected_no_line_number.as_ref().unwrap().to_string() } Some(ref e) => e.to_string(), }; builder.multi_line(true); configs.push(TesterConfig { label: "multiline-nonumber".to_string(), expected_reader: expected_slice.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); if self.auto_heap_limit { builder.heap_limit(Some(self.minimal_heap_limit(true))); configs.push(TesterConfig { label: "multiline-nonumber-heaplimit".to_string(), expected_reader: expected_slice.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); builder.heap_limit(None); } } if self.multi_line && self.line_number { let mut builder = builder.clone(); let expected_slice = match self.expected_slice_with_line_number { None => self .expected_with_line_number .as_ref() .unwrap() .to_string(), Some(ref e) => e.to_string(), }; builder.multi_line(true); builder.line_number(true); configs.push(TesterConfig { label: "multiline-number".to_string(), expected_reader: expected_slice.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); builder.heap_limit(Some(self.minimal_heap_limit(true))); configs.push(TesterConfig { label: "multiline-number-heaplimit".to_string(), expected_reader: expected_slice.clone(), expected_slice: expected_slice.clone(), builder: builder.clone(), matcher: matcher.clone(), }); builder.heap_limit(None); } configs } } #[derive(Debug)] struct TesterConfig { label: String, expected_reader: String, expected_slice: String, builder: SearcherBuilder, matcher: RegexMatcher, } impl TesterConfig { /// Execute a search using a reader. This exercises the incremental search /// strategy, where the entire contents of the corpus aren't necessarily /// in memory at once. fn search_reader(&self, haystack: &str) -> String { let mut sink = KitchenSink::new(); let mut searcher = self.builder.build(); let result = searcher.search_reader( &self.matcher, haystack.as_bytes(), &mut sink, ); if let Err(err) = result { let label = format!("reader-{}", self.label); panic!("error running '{}': {}", label, err); } String::from_utf8(sink.as_bytes().to_vec()).unwrap() } /// Execute a search using a slice. This exercises the search routines that /// have the entire contents of the corpus in memory at one time. fn search_slice(&self, haystack: &str) -> String { let mut sink = KitchenSink::new(); let mut searcher = self.builder.build(); let result = searcher.search_slice( &self.matcher, haystack.as_bytes(), &mut sink, ); if let Err(err) = result { let label = format!("slice-{}", self.label); panic!("error running '{}': {}", label, err); } String::from_utf8(sink.as_bytes().to_vec()).unwrap() } } #[cfg(test)] mod tests { use grep_matcher::{Match, Matcher}; use super::*; fn m(start: usize, end: usize) -> Match { Match::new(start, end) } #[test] fn empty_line1() { let haystack = b""; let matcher = RegexMatcher::new(r"^$"); assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(0, 0)))); } #[test] fn empty_line2() { let haystack = b"\n"; let matcher = RegexMatcher::new(r"^$"); assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(0, 0)))); assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(1, 1)))); } #[test] fn empty_line3() { let haystack = b"\n\n"; let matcher = RegexMatcher::new(r"^$"); assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(0, 0)))); assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(1, 1)))); assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2)))); } #[test] fn empty_line4() { let haystack = b"a\n\nb\n"; let matcher = RegexMatcher::new(r"^$"); assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(2, 2)))); assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(2, 2)))); assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2)))); assert_eq!(matcher.find_at(haystack, 3), Ok(Some(m(5, 5)))); assert_eq!(matcher.find_at(haystack, 4), Ok(Some(m(5, 5)))); assert_eq!(matcher.find_at(haystack, 5), Ok(Some(m(5, 5)))); } #[test] fn empty_line5() { let haystack = b"a\n\nb\nc"; let matcher = RegexMatcher::new(r"^$"); assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(2, 2)))); assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(2, 2)))); assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2)))); assert_eq!(matcher.find_at(haystack, 3), Ok(None)); assert_eq!(matcher.find_at(haystack, 4), Ok(None)); assert_eq!(matcher.find_at(haystack, 5), Ok(None)); assert_eq!(matcher.find_at(haystack, 6), Ok(None)); } #[test] fn empty_line6() { let haystack = b"a\n"; let matcher = RegexMatcher::new(r"^$"); assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(2, 2)))); assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(2, 2)))); assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2)))); } }