memchr-2.2.1/.gitignore010064400017500000144000000001361335001377300131760ustar0000000000000000.*.swp doc tags examples/ss10pusa.csv build target /Cargo.lock scratch* bench_large/huge tmp/ memchr-2.2.1/COPYING010064400017500000144000000001761274016737000122520ustar0000000000000000This project is dual-licensed under the Unlicense and MIT licenses. You may use this code under the terms of either license. memchr-2.2.1/Cargo.toml.orig010064400017500000144000000015571351045150200140770ustar0000000000000000[package] name = "memchr" version = "2.2.1" #:version authors = ["Andrew Gallant ", "bluss"] description = "Safe interface to memchr." documentation = "https://docs.rs/memchr/" homepage = "https://github.com/BurntSushi/rust-memchr" repository = "https://github.com/BurntSushi/rust-memchr" readme = "README.md" keywords = ["memchr", "char", "scan", "strchr", "string"] license = "Unlicense/MIT" exclude = ["/ci/*", "/.travis.yml", "/Makefile", "/appveyor.yml"] [badges] travis-ci = { repository = "BurntSushi/rust-memchr" } appveyor = { repository = "BurntSushi/rust-memchr" } [lib] name = "memchr" bench = false [features] default = ["use_std"] use_std = [] [dependencies] libc = { version = "0.2.18", default-features = false, optional = true } [dev-dependencies] quickcheck = { version = "0.8", default-features = false } [profile.test] opt-level = 3 memchr-2.2.1/Cargo.toml0000644000000025330000000000000103510ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g. crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] name = "memchr" version = "2.2.1" authors = ["Andrew Gallant ", "bluss"] exclude = ["/ci/*", "/.travis.yml", "/Makefile", "/appveyor.yml"] description = "Safe interface to memchr." homepage = "https://github.com/BurntSushi/rust-memchr" documentation = "https://docs.rs/memchr/" readme = "README.md" keywords = ["memchr", "char", "scan", "strchr", "string"] license = "Unlicense/MIT" repository = "https://github.com/BurntSushi/rust-memchr" [profile.test] opt-level = 3 [lib] name = "memchr" bench = false [dependencies.libc] version = "0.2.18" optional = true default-features = false [dev-dependencies.quickcheck] version = "0.8" default-features = false [features] default = ["use_std"] use_std = [] [badges.appveyor] repository = "BurntSushi/rust-memchr" [badges.travis-ci] repository = "BurntSushi/rust-memchr" memchr-2.2.1/Cargo.toml.orig0000644000000025340000000000000113110ustar00# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. 
If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] name = "memchr" version = "2.2.1" authors = ["Andrew Gallant ", "bluss"] exclude = ["/ci/*", "/.travis.yml", "/Makefile", "/appveyor.yml"] description = "Safe interface to memchr." homepage = "https://github.com/BurntSushi/rust-memchr" documentation = "https://docs.rs/memchr/" readme = "README.md" keywords = ["memchr", "char", "scan", "strchr", "string"] license = "Unlicense/MIT" repository = "https://github.com/BurntSushi/rust-memchr" [profile.test] opt-level = 3 [lib] name = "memchr" bench = false [dependencies.libc] version = "0.2.18" optional = true default-features = false [dev-dependencies.quickcheck] version = "0.8" default-features = false [features] default = ["use_std"] use_std = [] [badges.appveyor] repository = "BurntSushi/rust-memchr" [badges.travis-ci] repository = "BurntSushi/rust-memchr" memchr-2.2.1/LICENSE-MIT010064400017500000144000000020711274016737000126470ustar0000000000000000The MIT License (MIT) Copyright (c) 2015 Andrew Gallant Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. memchr-2.2.1/README.md010064400017500000144000000056451343042341200124710ustar0000000000000000memchr ====== The `memchr` crate provides heavily optimized routines for searching bytes. [![Build status](https://api.travis-ci.org/BurntSushi/rust-memchr.png)](https://travis-ci.org/BurntSushi/rust-memchr) [![Build status](https://ci.appveyor.com/api/projects/status/8i9484t8l4w7uql0/branch/master?svg=true)](https://ci.appveyor.com/project/BurntSushi/rust-memchr/branch/master) [![](http://meritbadge.herokuapp.com/memchr)](https://crates.io/crates/memchr) Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). ### Documentation [https://docs.rs/memchr](https://docs.rs/memchr) ### Overview The `memchr` function is traditionally provided by libc, however, the performance of `memchr` can vary significantly depending on the specific implementation of libc that is used. They can range from manually tuned Assembly implementations (like that found in GNU's libc) all the way to non-vectorized C implementations (like that found in MUSL). To smooth out the differences between implementations of libc, at least on `x86_64` for Rust 1.27+, this crate provides its own implementation of `memchr` that should perform competitively with the one found in GNU's libc. The implementation is in pure Rust and has no dependency on a C compiler or an Assembler. Additionally, GNU libc also provides an extension, `memrchr`. 
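For a quick feel for the API, here is an illustrative snippet mirroring the examples in the crate documentation (not part of the upstream README):

```rust
use memchr::{memchr, memrchr};

let haystack = b"the quick brown fox";
// First occurrence of `k`, last occurrence of `o`.
assert_eq!(memchr(b'k', haystack), Some(8));
assert_eq!(memrchr(b'o', haystack), Some(17));
```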
This crate provides its own implementation of `memrchr` as well, on top of `memchr2`, `memchr3`, `memrchr2` and `memrchr3`. The difference between `memchr` and `memchr2` is that that `memchr2` permits finding all occurrences of two bytes instead of one. Similarly for `memchr3`. ### Compiling without the standard library memchr links to the standard library by default, but you can disable the `use_std` feature if you want to use it in a `#![no_std]` crate: ```toml [dependencies] memchr = { version = "2", default-features = false } ``` On x86 platforms, when the `use_std` feature is disabled, the SSE2 implementation of memchr will be used in compilers that support it. When `use_std` is enabled, the AVX implementation of memchr will be used if the CPU is determined to support it at runtime. ### Using libc `memchr` is a routine that is part of libc, although this crate does not use libc by default. Instead, it uses its own routines, which are either vectorized or generic fallback routines. In general, these should be competitive with what's in libc, although this has not been tested for all architectures. If using `memchr` from libc is desirable and a vectorized routine is not otherwise available in this crate, then enabling the `libc` feature will use libc's version of `memchr`. The rest of the functions in this crate, e.g., `memchr2` or `memrchr3`, are not a standard part of libc, so they will always use the implementations in this crate. One exception to this is `memrchr`, which is an extension commonly found on Linux. On Linux, `memrchr` is used in precisely the same scenario as `memchr`, as described above. memchr-2.2.1/UNLICENSE010064400017500000144000000022731274016737000124670ustar0000000000000000This is free and unencumbered software released into the public domain. Anyone is free to copy, modify, publish, use, compile, sell, or distribute this software, either in source code form or as a compiled binary, for any purpose, commercial or non-commercial, and by any means. In jurisdictions that recognize copyright laws, the author or authors of this software dedicate any and all copyright interest in the software to the public domain. We make this dedication for the benefit of the public at large and to the detriment of our heirs and successors. We intend this dedication to be an overt act of relinquishment in perpetuity of all present and future rights to this software under copyright law. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For more information, please refer to memchr-2.2.1/build.rs010064400017500000144000000072741343042204300126560ustar0000000000000000use std::env; use std::ffi::OsString; use std::process::Command; fn main() { let version = match Version::read() { Ok(version) => version, Err(err) => { eprintln!("failed to parse `rustc --version`: {}", err); return; } }; enable_simd_optimizations(version); enable_libc(); } // This adds various simd cfgs if this compiler supports it. // // This can be disabled with RUSTFLAGS="--cfg memchr_disable_auto_simd", but // this is generally only intended for testing. 
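// As an illustration (this comment is editorial, not part of the original
// build script), the cfgs emitted below are consumed in the crate sources
// roughly like so:
//
//     #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))]
//     mod x86;
//
// i.e. the SIMD module only exists when the build script has decided that the
// compiler is new enough (Rust 1.27+) and auto-SIMD has not been disabled.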
fn enable_simd_optimizations(version: Version) { if is_env_set("CARGO_CFG_MEMCHR_DISABLE_AUTO_SIMD") { return; } if version < (Version { major: 1, minor: 27, patch: 0 }) { return; } println!("cargo:rustc-cfg=memchr_runtime_simd"); println!("cargo:rustc-cfg=memchr_runtime_sse2"); println!("cargo:rustc-cfg=memchr_runtime_sse42"); println!("cargo:rustc-cfg=memchr_runtime_avx"); } // This adds a `memchr_libc` cfg if and only if libc can be used, if no other // better option is available. // // This could be performed in the source code, but it's simpler to do it once // here and consolidate it into one cfg knob. // // Basically, we use libc only if its enabled and if we aren't targeting a // known bad platform. For example, wasm32 doesn't have a libc and the // performance of memchr on Windows is seemingly worse than the fallback // implementation. fn enable_libc() { const NO_ARCH: &'static [&'static str] = &["wasm32", "windows"]; const NO_ENV: &'static [&'static str] = &["sgx"]; if !is_feature_set("LIBC") { return; } let arch = match env::var("CARGO_CFG_TARGET_ARCH") { Err(_) => return, Ok(arch) => arch, }; let env = match env::var("CARGO_CFG_TARGET_ENV") { Err(_) => return, Ok(env) => env, }; if NO_ARCH.contains(&&*arch) || NO_ENV.contains(&&*env) { return; } println!("cargo:rustc-cfg=memchr_libc"); } fn is_feature_set(name: &str) -> bool { is_env_set(&format!("CARGO_FEATURE_{}", name)) } fn is_env_set(name: &str) -> bool { env::var_os(name).is_some() } #[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)] struct Version { major: u32, minor: u32, patch: u32, } impl Version { fn read() -> Result { let rustc = env::var_os("RUSTC").unwrap_or(OsString::from("rustc")); let output = Command::new(&rustc) .arg("--version") .output() .unwrap() .stdout; Version::parse(&String::from_utf8(output).unwrap()) } fn parse(mut s: &str) -> Result { if !s.starts_with("rustc ") { return Err(format!("unrecognized version string: {}", s)); } s = &s["rustc ".len()..]; let parts: Vec<&str> = s.split(".").collect(); if parts.len() < 3 { return Err(format!("not enough version parts: {:?}", parts)); } let mut num = String::new(); for c in parts[0].chars() { if !c.is_digit(10) { break; } num.push(c); } let major = num.parse::().map_err(|e| e.to_string())?; num.clear(); for c in parts[1].chars() { if !c.is_digit(10) { break; } num.push(c); } let minor = num.parse::().map_err(|e| e.to_string())?; num.clear(); for c in parts[2].chars() { if !c.is_digit(10) { break; } num.push(c); } let patch = num.parse::().map_err(|e| e.to_string())?; Ok(Version { major, minor, patch }) } } memchr-2.2.1/src/c.rs010064400017500000144000000021331343041204400125550ustar0000000000000000// This module defines safe wrappers around memchr (POSIX) and memrchr (GNU // extension). #![allow(dead_code)] extern crate libc; use self::libc::{c_int, c_void, size_t}; pub fn memchr(needle: u8, haystack: &[u8]) -> Option { let p = unsafe { libc::memchr( haystack.as_ptr() as *const c_void, needle as c_int, haystack.len() as size_t, ) }; if p.is_null() { None } else { Some(p as usize - (haystack.as_ptr() as usize)) } } // memrchr is a GNU extension. We know it's available on Linux, so start there. #[cfg(target_os = "linux")] pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { // GNU's memrchr() will - unlike memchr() - error if haystack is empty. 
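    // Editorial note: the guard below short-circuits the empty case so that a
    // zero-length buffer is never handed to libc in the first place.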
if haystack.is_empty() { return None; } let p = unsafe { libc::memrchr( haystack.as_ptr() as *const c_void, needle as c_int, haystack.len() as size_t, ) }; if p.is_null() { None } else { Some(p as usize - (haystack.as_ptr() as usize)) } } memchr-2.2.1/src/fallback.rs010064400017500000144000000270111343041200000140640ustar0000000000000000// This module defines pure Rust platform independent implementations of all // the memchr routines. We do our best to make them fast. Some of them may even // get auto-vectorized. use core::cmp; use core::ptr; use core::usize; #[cfg(target_pointer_width = "32")] const USIZE_BYTES: usize = 4; #[cfg(target_pointer_width = "64")] const USIZE_BYTES: usize = 8; // The number of bytes to loop at in one iteration of memchr/memrchr. const LOOP_SIZE: usize = 2 * USIZE_BYTES; /// Return `true` if `x` contains any zero byte. /// /// From *Matters Computational*, J. Arndt /// /// "The idea is to subtract one from each of the bytes and then look for /// bytes where the borrow propagated all the way to the most significant /// bit." #[inline(always)] fn contains_zero_byte(x: usize) -> bool { const LO_U64: u64 = 0x0101010101010101; const HI_U64: u64 = 0x8080808080808080; const LO_USIZE: usize = LO_U64 as usize; const HI_USIZE: usize = HI_U64 as usize; x.wrapping_sub(LO_USIZE) & !x & HI_USIZE != 0 } /// Repeat the given byte into a word size number. That is, every 8 bits /// is equivalent to the given byte. For example, if `b` is `\x4E` or /// `01001110` in binary, then the returned value on a 32-bit system would be: /// `01001110_01001110_01001110_01001110`. #[inline(always)] fn repeat_byte(b: u8) -> usize { (b as usize) * (usize::MAX / 255) } pub fn memchr(n1: u8, haystack: &[u8]) -> Option { let vn1 = repeat_byte(n1); let confirm = |byte| byte == n1; let loop_size = cmp::min(LOOP_SIZE, haystack.len()); let align = USIZE_BYTES - 1; let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; unsafe { if haystack.len() < USIZE_BYTES { return forward_search(start_ptr, end_ptr, ptr, confirm); } let chunk = read_unaligned_usize(ptr); if contains_zero_byte(chunk ^ vn1) { return forward_search(start_ptr, end_ptr, ptr, confirm); } ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align)); debug_assert!(ptr > start_ptr); debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); while loop_size == LOOP_SIZE && ptr <= ptr_sub(end_ptr, loop_size) { debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); let a = *(ptr as *const usize); let b = *(ptr_add(ptr, USIZE_BYTES) as *const usize); let eqa = contains_zero_byte(a ^ vn1); let eqb = contains_zero_byte(b ^ vn1); if eqa || eqb { break; } ptr = ptr_add(ptr, LOOP_SIZE); } forward_search(start_ptr, end_ptr, ptr, confirm) } } /// Like `memchr`, but searches for two bytes instead of one. 
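// Editorial worked example of the zero-byte trick reused by memchr2 and the
// routines below (not part of the original source): with n1 = b'a' = 0x61,
// repeat_byte(0x61) is 0x6161616161616161 on 64-bit targets. XORing a chunk
// against it zeroes exactly the bytes equal to 0x61, e.g.
//
//     0x7a7a7a7a7a7a617a ^ 0x6161616161616161 == 0x1b1b1b1b1b1b001b
//
// and contains_zero_byte then reports true, because subtracting 0x01 wraps
// the zero byte to 0xFF (high bit set) while `!x` also has that high bit set.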
pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { let vn1 = repeat_byte(n1); let vn2 = repeat_byte(n2); let confirm = |byte| byte == n1 || byte == n2; let align = USIZE_BYTES - 1; let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; unsafe { if haystack.len() < USIZE_BYTES { return forward_search(start_ptr, end_ptr, ptr, confirm); } let chunk = read_unaligned_usize(ptr); let eq1 = contains_zero_byte(chunk ^ vn1); let eq2 = contains_zero_byte(chunk ^ vn2); if eq1 || eq2 { return forward_search(start_ptr, end_ptr, ptr, confirm); } ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align)); debug_assert!(ptr > start_ptr); debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); while ptr <= ptr_sub(end_ptr, USIZE_BYTES) { debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); let chunk = *(ptr as *const usize); let eq1 = contains_zero_byte(chunk ^ vn1); let eq2 = contains_zero_byte(chunk ^ vn2); if eq1 || eq2 { break; } ptr = ptr_add(ptr, USIZE_BYTES); } forward_search(start_ptr, end_ptr, ptr, confirm) } } /// Like `memchr`, but searches for three bytes instead of one. pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { let vn1 = repeat_byte(n1); let vn2 = repeat_byte(n2); let vn3 = repeat_byte(n3); let confirm = |byte| byte == n1 || byte == n2 || byte == n3; let align = USIZE_BYTES - 1; let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; unsafe { if haystack.len() < USIZE_BYTES { return forward_search(start_ptr, end_ptr, ptr, confirm); } let chunk = read_unaligned_usize(ptr); let eq1 = contains_zero_byte(chunk ^ vn1); let eq2 = contains_zero_byte(chunk ^ vn2); let eq3 = contains_zero_byte(chunk ^ vn3); if eq1 || eq2 || eq3 { return forward_search(start_ptr, end_ptr, ptr, confirm); } ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align)); debug_assert!(ptr > start_ptr); debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr); while ptr <= ptr_sub(end_ptr, USIZE_BYTES) { debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); let chunk = *(ptr as *const usize); let eq1 = contains_zero_byte(chunk ^ vn1); let eq2 = contains_zero_byte(chunk ^ vn2); let eq3 = contains_zero_byte(chunk ^ vn3); if eq1 || eq2 || eq3 { break; } ptr = ptr_add(ptr, USIZE_BYTES); } forward_search(start_ptr, end_ptr, ptr, confirm) } } /// Return the last index matching the byte `x` in `text`. 
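// Editorial example (not part of the original source):
// memrchr(b'a', b"zaza") == Some(3), i.e. the index of the last `a` rather
// than the first.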
pub fn memrchr(n1: u8, haystack: &[u8]) -> Option { let vn1 = repeat_byte(n1); let confirm = |byte| byte == n1; let loop_size = cmp::min(LOOP_SIZE, haystack.len()); let align = USIZE_BYTES - 1; let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = end_ptr; unsafe { if haystack.len() < USIZE_BYTES { return reverse_search(start_ptr, end_ptr, ptr, confirm); } let chunk = read_unaligned_usize(ptr_sub(ptr, USIZE_BYTES)); if contains_zero_byte(chunk ^ vn1) { return reverse_search(start_ptr, end_ptr, ptr, confirm); } ptr = (end_ptr as usize & !align) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while loop_size == LOOP_SIZE && ptr >= ptr_add(start_ptr, loop_size) { debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); let a = *(ptr_sub(ptr, 2 * USIZE_BYTES) as *const usize); let b = *(ptr_sub(ptr, 1 * USIZE_BYTES) as *const usize); let eqa = contains_zero_byte(a ^ vn1); let eqb = contains_zero_byte(b ^ vn1); if eqa || eqb { break; } ptr = ptr_sub(ptr, loop_size); } reverse_search(start_ptr, end_ptr, ptr, confirm) } } /// Like `memrchr`, but searches for two bytes instead of one. pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { let vn1 = repeat_byte(n1); let vn2 = repeat_byte(n2); let confirm = |byte| byte == n1 || byte == n2; let align = USIZE_BYTES - 1; let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = end_ptr; unsafe { if haystack.len() < USIZE_BYTES { return reverse_search(start_ptr, end_ptr, ptr, confirm); } let chunk = read_unaligned_usize(ptr_sub(ptr, USIZE_BYTES)); let eq1 = contains_zero_byte(chunk ^ vn1); let eq2 = contains_zero_byte(chunk ^ vn2); if eq1 || eq2 { return reverse_search(start_ptr, end_ptr, ptr, confirm); } ptr = (end_ptr as usize & !align) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while ptr >= ptr_add(start_ptr, USIZE_BYTES) { debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); let chunk = *(ptr_sub(ptr, USIZE_BYTES) as *const usize); let eq1 = contains_zero_byte(chunk ^ vn1); let eq2 = contains_zero_byte(chunk ^ vn2); if eq1 || eq2 { break; } ptr = ptr_sub(ptr, USIZE_BYTES); } reverse_search(start_ptr, end_ptr, ptr, confirm) } } /// Like `memrchr`, but searches for three bytes instead of one. 
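// Editorial example (not part of the original source):
// memrchr3(b'a', b'b', b'c', b"abcabc") == Some(5), the last position
// matching any of the three needles.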
pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { let vn1 = repeat_byte(n1); let vn2 = repeat_byte(n2); let vn3 = repeat_byte(n3); let confirm = |byte| byte == n1 || byte == n2 || byte == n3; let align = USIZE_BYTES - 1; let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = end_ptr; unsafe { if haystack.len() < USIZE_BYTES { return reverse_search(start_ptr, end_ptr, ptr, confirm); } let chunk = read_unaligned_usize(ptr_sub(ptr, USIZE_BYTES)); let eq1 = contains_zero_byte(chunk ^ vn1); let eq2 = contains_zero_byte(chunk ^ vn2); let eq3 = contains_zero_byte(chunk ^ vn3); if eq1 || eq2 || eq3 { return reverse_search(start_ptr, end_ptr, ptr, confirm); } ptr = (end_ptr as usize & !align) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while ptr >= ptr_add(start_ptr, USIZE_BYTES) { debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES); let chunk = *(ptr_sub(ptr, USIZE_BYTES) as *const usize); let eq1 = contains_zero_byte(chunk ^ vn1); let eq2 = contains_zero_byte(chunk ^ vn2); let eq3 = contains_zero_byte(chunk ^ vn3); if eq1 || eq2 || eq3 { break; } ptr = ptr_sub(ptr, USIZE_BYTES); } reverse_search(start_ptr, end_ptr, ptr, confirm) } } #[inline(always)] unsafe fn forward_search bool>( start_ptr: *const u8, end_ptr: *const u8, mut ptr: *const u8, confirm: F, ) -> Option { debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr); while ptr < end_ptr { if confirm(*ptr) { return Some(sub(ptr, start_ptr)); } ptr = ptr.offset(1); } None } #[inline(always)] unsafe fn reverse_search bool>( start_ptr: *const u8, end_ptr: *const u8, mut ptr: *const u8, confirm: F, ) -> Option { debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr); while ptr > start_ptr { ptr = ptr.offset(-1); if confirm(*ptr) { return Some(sub(ptr, start_ptr)); } } None } /// Increment the given pointer by the given amount. unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 { debug_assert!(amt < ::core::isize::MAX as usize); ptr.offset(amt as isize) } /// Decrement the given pointer by the given amount. unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 { debug_assert!(amt < ::core::isize::MAX as usize); ptr.offset((amt as isize).wrapping_neg()) } unsafe fn read_unaligned_usize(ptr: *const u8) -> usize { let mut n: usize = 0; ptr::copy_nonoverlapping(ptr, &mut n as *mut _ as *mut u8, USIZE_BYTES); n } /// Subtract `b` from `a` and return the difference. `a` should be greater than /// or equal to `b`. fn sub(a: *const u8, b: *const u8) -> usize { debug_assert!(a >= b); (a as usize) - (b as usize) } memchr-2.2.1/src/iter.rs010064400017500000144000000103671335001377300133150ustar0000000000000000use {memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3}; macro_rules! iter_next { // Common code for the memchr iterators: // update haystack and position and produce the index // // self: &mut Self where Self is the iterator // search_result: Option which is the result of the corresponding // memchr function. // // Returns Option (the next iterator element) ($self_:expr, $search_result:expr) => { $search_result.map(move |index| { // split and take the remaining back half $self_.haystack = $self_.haystack.split_at(index + 1).1; let found_position = $self_.position + index; $self_.position = found_position + 1; found_position }) } } macro_rules! 
iter_next_back { ($self_:expr, $search_result:expr) => { $search_result.map(move |index| { // split and take the remaining front half $self_.haystack = $self_.haystack.split_at(index).0; $self_.position + index }) } } /// An iterator for `memchr`. pub struct Memchr<'a> { needle: u8, // The haystack to iterate over haystack: &'a [u8], // The index position: usize, } impl<'a> Memchr<'a> { /// Creates a new iterator that yields all positions of needle in haystack. #[inline] pub fn new(needle: u8, haystack: &[u8]) -> Memchr { Memchr { needle: needle, haystack: haystack, position: 0, } } } impl<'a> Iterator for Memchr<'a> { type Item = usize; #[inline] fn next(&mut self) -> Option { iter_next!(self, memchr(self.needle, self.haystack)) } #[inline] fn size_hint(&self) -> (usize, Option) { (0, Some(self.haystack.len())) } } impl<'a> DoubleEndedIterator for Memchr<'a> { #[inline] fn next_back(&mut self) -> Option { iter_next_back!(self, memrchr(self.needle, self.haystack)) } } /// An iterator for `memchr2`. pub struct Memchr2<'a> { needle1: u8, needle2: u8, // The haystack to iterate over haystack: &'a [u8], // The index position: usize, } impl<'a> Memchr2<'a> { /// Creates a new iterator that yields all positions of needle in haystack. #[inline] pub fn new(needle1: u8, needle2: u8, haystack: &[u8]) -> Memchr2 { Memchr2 { needle1: needle1, needle2: needle2, haystack: haystack, position: 0, } } } impl<'a> Iterator for Memchr2<'a> { type Item = usize; #[inline] fn next(&mut self) -> Option { iter_next!(self, memchr2(self.needle1, self.needle2, self.haystack)) } #[inline] fn size_hint(&self) -> (usize, Option) { (0, Some(self.haystack.len())) } } impl<'a> DoubleEndedIterator for Memchr2<'a> { #[inline] fn next_back(&mut self) -> Option { iter_next_back!( self, memrchr2(self.needle1, self.needle2, self.haystack) ) } } /// An iterator for `memchr3`. pub struct Memchr3<'a> { needle1: u8, needle2: u8, needle3: u8, // The haystack to iterate over haystack: &'a [u8], // The index position: usize, } impl<'a> Memchr3<'a> { /// Create a new `Memchr3` that's initialized to zero with a haystack #[inline] pub fn new( needle1: u8, needle2: u8, needle3: u8, haystack: &[u8], ) -> Memchr3 { Memchr3 { needle1: needle1, needle2: needle2, needle3: needle3, haystack: haystack, position: 0, } } } impl<'a> Iterator for Memchr3<'a> { type Item = usize; #[inline] fn next(&mut self) -> Option { iter_next!( self, memchr3(self.needle1, self.needle2, self.needle3, self.haystack) ) } #[inline] fn size_hint(&self) -> (usize, Option) { (0, Some(self.haystack.len())) } } impl<'a> DoubleEndedIterator for Memchr3<'a> { #[inline] fn next_back(&mut self) -> Option { iter_next_back!( self, memrchr3(self.needle1, self.needle2, self.needle3, self.haystack) ) } } memchr-2.2.1/src/lib.rs010064400017500000144000000213401343042204300131020ustar0000000000000000/*! The `memchr` crate provides heavily optimized routines for searching bytes. The `memchr` function is traditionally provided by libc, however, the performance of `memchr` can vary significantly depending on the specific implementation of libc that is used. They can range from manually tuned Assembly implementations (like that found in GNU's libc) all the way to non-vectorized C implementations (like that found in MUSL). To smooth out the differences between implementations of libc, at least on `x86_64` for Rust 1.27+, this crate provides its own implementation of `memchr` that should perform competitively with the one found in GNU's libc. 
The implementation is in pure Rust and has no dependency on a C compiler or an Assembler. Additionally, GNU libc also provides an extension, `memrchr`. This crate provides its own implementation of `memrchr` as well, on top of `memchr2`, `memchr3`, `memrchr2` and `memrchr3`. The difference between `memchr` and `memchr2` is that that `memchr2` permits finding all occurrences of two bytes instead of one. Similarly for `memchr3`. */ #![cfg_attr(not(feature = "use_std"), no_std)] #![deny(missing_docs)] #![doc(html_root_url = "https://docs.rs/memchr/2.0.0")] // Supporting 16-bit would be fine. If you need it, please submit a bug report // at https://github.com/BurntSushi/rust-memchr #[cfg(not(any(target_pointer_width = "32", target_pointer_width = "64")))] compile_error!("memchr currently not supported on non-32 or non-64 bit"); #[cfg(feature = "use_std")] extern crate core; #[cfg(test)] #[macro_use] extern crate quickcheck; use core::iter::Rev; pub use iter::{Memchr, Memchr2, Memchr3}; // N.B. If you're looking for the cfg knobs for libc, see build.rs. #[cfg(memchr_libc)] mod c; #[allow(dead_code)] mod fallback; mod iter; mod naive; #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] mod x86; #[cfg(test)] mod tests; /// An iterator over all occurrences of the needle in a haystack. #[inline] pub fn memchr_iter(needle: u8, haystack: &[u8]) -> Memchr { Memchr::new(needle, haystack) } /// An iterator over all occurrences of the needles in a haystack. #[inline] pub fn memchr2_iter( needle1: u8, needle2: u8, haystack: &[u8], ) -> Memchr2 { Memchr2::new(needle1, needle2, haystack) } /// An iterator over all occurrences of the needles in a haystack. #[inline] pub fn memchr3_iter( needle1: u8, needle2: u8, needle3: u8, haystack: &[u8], ) -> Memchr3 { Memchr3::new(needle1, needle2, needle3, haystack) } /// An iterator over all occurrences of the needle in a haystack, in reverse. #[inline] pub fn memrchr_iter(needle: u8, haystack: &[u8]) -> Rev { Memchr::new(needle, haystack).rev() } /// An iterator over all occurrences of the needles in a haystack, in reverse. #[inline] pub fn memrchr2_iter( needle1: u8, needle2: u8, haystack: &[u8], ) -> Rev { Memchr2::new(needle1, needle2, haystack).rev() } /// An iterator over all occurrences of the needles in a haystack, in reverse. #[inline] pub fn memrchr3_iter( needle1: u8, needle2: u8, needle3: u8, haystack: &[u8], ) -> Rev { Memchr3::new(needle1, needle2, needle3, haystack).rev() } /// Search for the first occurrence of a byte in a slice. /// /// This returns the index corresponding to the first occurrence of `needle` in /// `haystack`, or `None` if one is not found. /// /// While this is operationally the same as something like /// `haystack.iter().position(|&b| b == needle)`, `memchr` will use a highly /// optimized routine that can be up to an order of magnitude faster in some /// cases. /// /// # Example /// /// This shows how to find the first position of a byte in a byte string. 
/// /// ``` /// use memchr::memchr; /// /// let haystack = b"the quick brown fox"; /// assert_eq!(memchr(b'k', haystack), Some(8)); /// ``` #[inline] pub fn memchr(needle: u8, haystack: &[u8]) -> Option { #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] #[inline(always)] fn imp(n1: u8, haystack: &[u8]) -> Option { x86::memchr(n1, haystack) } #[cfg(all( memchr_libc, not(all(target_arch = "x86_64", memchr_runtime_simd)) ))] #[inline(always)] fn imp(n1: u8, haystack: &[u8]) -> Option { c::memchr(n1, haystack) } #[cfg(all( not(memchr_libc), not(all(target_arch = "x86_64", memchr_runtime_simd)) ))] #[inline(always)] fn imp(n1: u8, haystack: &[u8]) -> Option { fallback::memchr(n1, haystack) } if haystack.is_empty() { None } else { imp(needle, haystack) } } /// Like `memchr`, but searches for two bytes instead of one. #[inline] pub fn memchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option { #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] #[inline(always)] fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { x86::memchr2(n1, n2, haystack) } #[cfg(not(all(target_arch = "x86_64", memchr_runtime_simd)))] #[inline(always)] fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { fallback::memchr2(n1, n2, haystack) } if haystack.is_empty() { None } else { imp(needle1, needle2, haystack) } } /// Like `memchr`, but searches for three bytes instead of one. #[inline] pub fn memchr3( needle1: u8, needle2: u8, needle3: u8, haystack: &[u8], ) -> Option { #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] #[inline(always)] fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { x86::memchr3(n1, n2, n3, haystack) } #[cfg(not(all(target_arch = "x86_64", memchr_runtime_simd)))] #[inline(always)] fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { fallback::memchr3(n1, n2, n3, haystack) } if haystack.is_empty() { None } else { imp(needle1, needle2, needle3, haystack) } } /// Search for the last occurrence of a byte in a slice. /// /// This returns the index corresponding to the last occurrence of `needle` in /// `haystack`, or `None` if one is not found. /// /// While this is operationally the same as something like /// `haystack.iter().rposition(|&b| b == needle)`, `memrchr` will use a highly /// optimized routine that can be up to an order of magnitude faster in some /// cases. /// /// # Example /// /// This shows how to find the last position of a byte in a byte string. /// /// ``` /// use memchr::memrchr; /// /// let haystack = b"the quick brown fox"; /// assert_eq!(memrchr(b'o', haystack), Some(17)); /// ``` #[inline] pub fn memrchr(needle: u8, haystack: &[u8]) -> Option { #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] #[inline(always)] fn imp(n1: u8, haystack: &[u8]) -> Option { x86::memrchr(n1, haystack) } #[cfg(all( all(memchr_libc, target_os = "linux"), not(all(target_arch = "x86_64", memchr_runtime_simd)) ))] #[inline(always)] fn imp(n1: u8, haystack: &[u8]) -> Option { c::memrchr(n1, haystack) } #[cfg(all( not(all(memchr_libc, target_os = "linux")), not(all(target_arch = "x86_64", memchr_runtime_simd)) ))] #[inline(always)] fn imp(n1: u8, haystack: &[u8]) -> Option { fallback::memrchr(n1, haystack) } if haystack.is_empty() { None } else { imp(needle, haystack) } } /// Like `memrchr`, but searches for two bytes instead of one. 
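// Editorial usage example (not part of the original docs):
//
//     assert_eq!(memchr::memrchr2(b'a', b'z', b"zaza"), Some(3));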
#[inline] pub fn memrchr2(needle1: u8, needle2: u8, haystack: &[u8]) -> Option { #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] #[inline(always)] fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { x86::memrchr2(n1, n2, haystack) } #[cfg(not(all(target_arch = "x86_64", memchr_runtime_simd)))] #[inline(always)] fn imp(n1: u8, n2: u8, haystack: &[u8]) -> Option { fallback::memrchr2(n1, n2, haystack) } if haystack.is_empty() { None } else { imp(needle1, needle2, haystack) } } /// Like `memrchr`, but searches for three bytes instead of one. #[inline] pub fn memrchr3( needle1: u8, needle2: u8, needle3: u8, haystack: &[u8], ) -> Option { #[cfg(all(target_arch = "x86_64", memchr_runtime_simd))] #[inline(always)] fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { x86::memrchr3(n1, n2, n3, haystack) } #[cfg(not(all(target_arch = "x86_64", memchr_runtime_simd)))] #[inline(always)] fn imp(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { fallback::memrchr3(n1, n2, n3, haystack) } if haystack.is_empty() { None } else { imp(needle1, needle2, needle3, haystack) } } memchr-2.2.1/src/naive.rs010064400017500000144000000015551335001377300134530ustar0000000000000000#![allow(dead_code)] pub fn memchr(n1: u8, haystack: &[u8]) -> Option { haystack .iter() .position(|&b| b == n1) } pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { haystack .iter() .position(|&b| b == n1 || b == n2) } pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { haystack .iter() .position(|&b| b == n1 || b == n2 || b == n3) } pub fn memrchr(n1: u8, haystack: &[u8]) -> Option { haystack .iter() .rposition(|&b| b == n1) } pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { haystack .iter() .rposition(|&b| b == n1 || b == n2) } pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { haystack .iter() .rposition(|&b| b == n1 || b == n2 || b == n3) } memchr-2.2.1/src/tests/iter.rs010064400017500000144000000140141335001377300144500ustar0000000000000000use tests::memchr_tests; use {Memchr, Memchr2, Memchr3}; #[test] fn memchr1_iter() { for test in memchr_tests() { test.iter_one(false, Memchr::new); } } #[test] fn memchr2_iter() { for test in memchr_tests() { test.iter_two(false, Memchr2::new); } } #[test] fn memchr3_iter() { for test in memchr_tests() { test.iter_three(false, Memchr3::new); } } #[test] fn memrchr1_iter() { for test in memchr_tests() { test.iter_one(true, |n1, corpus| Memchr::new(n1, corpus).rev()); } } #[test] fn memrchr2_iter() { for test in memchr_tests() { test.iter_two(true, |n1, n2, corpus| { Memchr2::new(n1, n2, corpus).rev() }) } } #[test] fn memrchr3_iter() { for test in memchr_tests() { test.iter_three(true, |n1, n2, n3, corpus| { Memchr3::new(n1, n2, n3, corpus).rev() }) } } quickcheck! 
{ fn qc_memchr_double_ended_iter( needle: u8, data: Vec, take_side: Vec ) -> bool { // make nonempty let mut take_side = take_side; if take_side.is_empty() { take_side.push(true) }; let iter = Memchr::new(needle, &data); let all_found = double_ended_take( iter, take_side.iter().cycle().cloned()); all_found.iter().cloned().eq(positions1(needle, &data)) } fn qc_memchr2_double_ended_iter( needle1: u8, needle2: u8, data: Vec, take_side: Vec ) -> bool { // make nonempty let mut take_side = take_side; if take_side.is_empty() { take_side.push(true) }; let iter = Memchr2::new(needle1, needle2, &data); let all_found = double_ended_take( iter, take_side.iter().cycle().cloned()); all_found.iter().cloned().eq(positions2(needle1, needle2, &data)) } fn qc_memchr3_double_ended_iter( needle1: u8, needle2: u8, needle3: u8, data: Vec, take_side: Vec ) -> bool { // make nonempty let mut take_side = take_side; if take_side.is_empty() { take_side.push(true) }; let iter = Memchr3::new(needle1, needle2, needle3, &data); let all_found = double_ended_take( iter, take_side.iter().cycle().cloned()); all_found .iter() .cloned() .eq(positions3(needle1, needle2, needle3, &data)) } fn qc_memchr1_iter(data: Vec) -> bool { let needle = 0; let answer = positions1(needle, &data); answer.eq(Memchr::new(needle, &data)) } fn qc_memchr1_rev_iter(data: Vec) -> bool { let needle = 0; let answer = positions1(needle, &data); answer.rev().eq(Memchr::new(needle, &data).rev()) } fn qc_memchr2_iter(data: Vec) -> bool { let needle1 = 0; let needle2 = 1; let answer = positions2(needle1, needle2, &data); answer.eq(Memchr2::new(needle1, needle2, &data)) } fn qc_memchr2_rev_iter(data: Vec) -> bool { let needle1 = 0; let needle2 = 1; let answer = positions2(needle1, needle2, &data); answer.rev().eq(Memchr2::new(needle1, needle2, &data).rev()) } fn qc_memchr3_iter(data: Vec) -> bool { let needle1 = 0; let needle2 = 1; let needle3 = 2; let answer = positions3(needle1, needle2, needle3, &data); answer.eq(Memchr3::new(needle1, needle2, needle3, &data)) } fn qc_memchr3_rev_iter(data: Vec) -> bool { let needle1 = 0; let needle2 = 1; let needle3 = 2; let answer = positions3(needle1, needle2, needle3, &data); answer.rev().eq(Memchr3::new(needle1, needle2, needle3, &data).rev()) } fn qc_memchr1_iter_size_hint(data: Vec) -> bool { // test that the size hint is within reasonable bounds let needle = 0; let mut iter = Memchr::new(needle, &data); let mut real_count = data .iter() .filter(|&&elt| elt == needle) .count(); while let Some(index) = iter.next() { real_count -= 1; let (lower, upper) = iter.size_hint(); assert!(lower <= real_count); assert!(upper.unwrap() >= real_count); assert!(upper.unwrap() <= data.len() - index); } true } } // take items from a DEI, taking front for each true and back for each false. // Return a vector with the concatenation of the fronts and the reverse of the // backs. 
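// Editorial example of the contract (not part of the original comment): for
// an iterator over [1, 2, 3, 4] and take_side = [false, false, true, ...],
// the backs come out as [4, 3] and the fronts as [1, 2], so the result is
// [1, 2] ++ reverse([4, 3]) == [1, 2, 3, 4] -- every element in order, which
// is what the quickcheck properties above compare against.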
fn double_ended_take(mut iter: I, take_side: J) -> Vec where I: DoubleEndedIterator, J: Iterator, { let mut found_front = Vec::new(); let mut found_back = Vec::new(); for take_front in take_side { if take_front { if let Some(pos) = iter.next() { found_front.push(pos); } else { break; } } else { if let Some(pos) = iter.next_back() { found_back.push(pos); } else { break; } }; } let mut all_found = found_front; all_found.extend(found_back.into_iter().rev()); all_found } // return an iterator of the 0-based indices of haystack that match the needle fn positions1<'a>( n1: u8, haystack: &'a [u8], ) -> Box + 'a> { let it = haystack .iter() .enumerate() .filter(move |&(_, &b)| b == n1) .map(|t| t.0); Box::new(it) } fn positions2<'a>( n1: u8, n2: u8, haystack: &'a [u8], ) -> Box + 'a> { let it = haystack .iter() .enumerate() .filter(move |&(_, &b)| b == n1 || b == n2) .map(|t| t.0); Box::new(it) } fn positions3<'a>( n1: u8, n2: u8, n3: u8, haystack: &'a [u8], ) -> Box + 'a> { let it = haystack .iter() .enumerate() .filter(move |&(_, &b)| b == n1 || b == n2 || b == n3) .map(|t| t.0); Box::new(it) } memchr-2.2.1/src/tests/memchr.rs010064400017500000144000000047411335001377300147660ustar0000000000000000use fallback; use naive; use {memchr, memchr2, memchr3, memrchr, memrchr2, memrchr3}; use tests::memchr_tests; #[test] fn memchr1_find() { for test in memchr_tests() { test.one(false, memchr); } } #[test] fn memchr1_fallback_find() { for test in memchr_tests() { test.one(false, fallback::memchr); } } #[test] fn memchr2_find() { for test in memchr_tests() { test.two(false, memchr2); } } #[test] fn memchr2_fallback_find() { for test in memchr_tests() { test.two(false, fallback::memchr2); } } #[test] fn memchr3_find() { for test in memchr_tests() { test.three(false, memchr3); } } #[test] fn memchr3_fallback_find() { for test in memchr_tests() { test.three(false, fallback::memchr3); } } #[test] fn memrchr1_find() { for test in memchr_tests() { test.one(true, memrchr); } } #[test] fn memrchr1_fallback_find() { for test in memchr_tests() { test.one(true, fallback::memrchr); } } #[test] fn memrchr2_find() { for test in memchr_tests() { test.two(true, memrchr2); } } #[test] fn memrchr2_fallback_find() { for test in memchr_tests() { test.two(true, fallback::memrchr2); } } #[test] fn memrchr3_find() { for test in memchr_tests() { test.three(true, memrchr3); } } #[test] fn memrchr3_fallback_find() { for test in memchr_tests() { test.three(true, fallback::memrchr3); } } quickcheck! { fn qc_memchr1_matches_naive(n1: u8, corpus: Vec) -> bool { memchr(n1, &corpus) == naive::memchr(n1, &corpus) } } quickcheck! { fn qc_memchr2_matches_naive(n1: u8, n2: u8, corpus: Vec) -> bool { memchr2(n1, n2, &corpus) == naive::memchr2(n1, n2, &corpus) } } quickcheck! { fn qc_memchr3_matches_naive( n1: u8, n2: u8, n3: u8, corpus: Vec ) -> bool { memchr3(n1, n2, n3, &corpus) == naive::memchr3(n1, n2, n3, &corpus) } } quickcheck! { fn qc_memrchr1_matches_naive(n1: u8, corpus: Vec) -> bool { memrchr(n1, &corpus) == naive::memrchr(n1, &corpus) } } quickcheck! { fn qc_memrchr2_matches_naive(n1: u8, n2: u8, corpus: Vec) -> bool { memrchr2(n1, n2, &corpus) == naive::memrchr2(n1, n2, &corpus) } } quickcheck! 
{ fn qc_memrchr3_matches_naive( n1: u8, n2: u8, n3: u8, corpus: Vec ) -> bool { memrchr3(n1, n2, n3, &corpus) == naive::memrchr3(n1, n2, n3, &corpus) } } memchr-2.2.1/src/tests/mod.rs010064400017500000144000000313011342062110000142450ustar0000000000000000use std::iter::repeat; mod iter; mod memchr; #[cfg(target_endian = "little")] #[test] fn byte_order() { eprintln!("LITTLE ENDIAN"); } #[cfg(target_endian = "big")] #[test] fn byte_order() { eprintln!("BIG ENDIAN"); } /// Create a sequence of tests that should be run by memchr implementations. fn memchr_tests() -> Vec { let mut tests = Vec::new(); for statict in MEMCHR_TESTS { assert!(!statict.corpus.contains("%"), "% is not allowed in corpora"); assert!(!statict.corpus.contains("#"), "# is not allowed in corpora"); assert!(!statict.needles.contains(&b'%'), "% is an invalid needle"); assert!(!statict.needles.contains(&b'#'), "# is an invalid needle"); let t = MemchrTest { corpus: statict.corpus.to_string(), needles: statict.needles.to_vec(), positions: statict.positions.to_vec(), }; tests.push(t.clone()); tests.extend(t.expand()); } tests } /// A set of tests for memchr-like functions. /// /// These tests mostly try to cover the short string cases. We cover the longer /// string cases via the benchmarks (which are tests themselves), via /// quickcheck tests and via automatic expansion of each test case (by /// increasing the corpus size). Finally, we cover different alignment cases /// in the tests by varying the starting point of the slice. const MEMCHR_TESTS: &[MemchrTestStatic] = &[ // one needle (applied to memchr + memchr2 + memchr3) MemchrTestStatic { corpus: "a", needles: &[b'a'], positions: &[0], }, MemchrTestStatic { corpus: "aa", needles: &[b'a'], positions: &[0, 1], }, MemchrTestStatic { corpus: "aaa", needles: &[b'a'], positions: &[0, 1, 2], }, MemchrTestStatic { corpus: "", needles: &[b'a'], positions: &[], }, MemchrTestStatic { corpus: "z", needles: &[b'a'], positions: &[], }, MemchrTestStatic { corpus: "zz", needles: &[b'a'], positions: &[], }, MemchrTestStatic { corpus: "zza", needles: &[b'a'], positions: &[2], }, MemchrTestStatic { corpus: "zaza", needles: &[b'a'], positions: &[1, 3], }, MemchrTestStatic { corpus: "zzza", needles: &[b'a'], positions: &[3], }, MemchrTestStatic { corpus: "\x00a", needles: &[b'a'], positions: &[1], }, MemchrTestStatic { corpus: "\x00", needles: &[b'\x00'], positions: &[0], }, MemchrTestStatic { corpus: "\x00\x00", needles: &[b'\x00'], positions: &[0, 1], }, MemchrTestStatic { corpus: "\x00a\x00", needles: &[b'\x00'], positions: &[0, 2], }, MemchrTestStatic { corpus: "zzzzzzzzzzzzzzzza", needles: &[b'a'], positions: &[16], }, MemchrTestStatic { corpus: "zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzza", needles: &[b'a'], positions: &[32], }, // two needles (applied to memchr2 + memchr3) MemchrTestStatic { corpus: "az", needles: &[b'a', b'z'], positions: &[0, 1], }, MemchrTestStatic { corpus: "az", needles: &[b'a', b'z'], positions: &[0, 1], }, MemchrTestStatic { corpus: "az", needles: &[b'x', b'y'], positions: &[], }, MemchrTestStatic { corpus: "az", needles: &[b'a', b'y'], positions: &[0], }, MemchrTestStatic { corpus: "az", needles: &[b'x', b'z'], positions: &[1], }, MemchrTestStatic { corpus: "yyyyaz", needles: &[b'a', b'z'], positions: &[4, 5], }, MemchrTestStatic { corpus: "yyyyaz", needles: &[b'z', b'a'], positions: &[4, 5], }, // three needles (applied to memchr3) MemchrTestStatic { corpus: "xyz", needles: &[b'x', b'y', b'z'], positions: &[0, 1, 2], }, MemchrTestStatic { corpus: "zxy", needles: &[b'x', 
b'y', b'z'], positions: &[0, 1, 2], }, MemchrTestStatic { corpus: "zxy", needles: &[b'x', b'a', b'z'], positions: &[0, 1], }, MemchrTestStatic { corpus: "zxy", needles: &[b't', b'a', b'z'], positions: &[0], }, MemchrTestStatic { corpus: "yxz", needles: &[b't', b'a', b'z'], positions: &[2], }, ]; /// A description of a test on a memchr like function. #[derive(Clone, Debug)] struct MemchrTest { /// The thing to search. We use `&str` instead of `&[u8]` because they /// are nicer to write in tests, and we don't miss much since memchr /// doesn't care about UTF-8. /// /// Corpora cannot contain either '%' or '#'. We use these bytes when /// expanding test cases into many test cases, and we assume they are not /// used. If they are used, `memchr_tests` will panic. corpus: String, /// The needles to search for. This is intended to be an "alternation" of /// needles. The number of needles may cause this test to be skipped for /// some memchr variants. For example, a test with 2 needles cannot be used /// to test `memchr`, but can be used to test `memchr2` and `memchr3`. /// However, a test with only 1 needle can be used to test all of `memchr`, /// `memchr2` and `memchr3`. We achieve this by filling in the needles with /// bytes that we never used in the corpus (such as '#'). needles: Vec, /// The positions expected to match for all of the needles. positions: Vec, } /// Like MemchrTest, but easier to define as a constant. #[derive(Clone, Debug)] struct MemchrTestStatic { corpus: &'static str, needles: &'static [u8], positions: &'static [usize], } impl MemchrTest { fn one Option>( &self, reverse: bool, f: F, ) { let needles = match self.needles(1) { None => return, Some(needles) => needles, }; // We test different alignments here. Since some implementations use // AVX2, which can read 32 bytes at a time, we test at least that. // Moreover, with loop unrolling, we sometimes process 64 (sse2) or 128 // (avx) bytes at a time, so we include that in our offsets as well. // // You might think this would cause most needles to not be found, but // we actually expand our tests to include corpus sizes all the way up // to >500 bytes, so we should exericse most branches. 
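        // Editorial example of how alignment shifts expectations: for the
        // static test { corpus: "zzza", positions: [3] }, align == 3 searches
        // the suffix "a" and the expected position becomes 3 - 3 == 0.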
for align in 0..130 { let corpus = self.corpus(align); assert_eq!( self.positions(align, reverse).get(0).cloned(), f(needles[0], corpus.as_bytes()), "search for {:?} failed in: {:?} (len: {}, alignment: {})", needles[0] as char, corpus, corpus.len(), align ); } } fn two Option>( &self, reverse: bool, f: F, ) { let needles = match self.needles(2) { None => return, Some(needles) => needles, }; for align in 0..130 { let corpus = self.corpus(align); assert_eq!( self.positions(align, reverse).get(0).cloned(), f(needles[0], needles[1], corpus.as_bytes()), "search for {:?}|{:?} failed in: {:?} \ (len: {}, alignment: {})", needles[0] as char, needles[1] as char, corpus, corpus.len(), align ); } } fn three Option>( &self, reverse: bool, f: F, ) { let needles = match self.needles(3) { None => return, Some(needles) => needles, }; for align in 0..130 { let corpus = self.corpus(align); assert_eq!( self.positions(align, reverse).get(0).cloned(), f(needles[0], needles[1], needles[2], corpus.as_bytes()), "search for {:?}|{:?}|{:?} failed in: {:?} \ (len: {}, alignment: {})", needles[0] as char, needles[1] as char, needles[2] as char, corpus, corpus.len(), align ); } } fn iter_one<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, &'a [u8]) -> I, I: Iterator { if let Some(ns) = self.needles(1) { self.iter(reverse, f(ns[0], self.corpus.as_bytes())); } } fn iter_two<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, &'a [u8]) -> I, I: Iterator { if let Some(ns) = self.needles(2) { self.iter(reverse, f(ns[0], ns[1], self.corpus.as_bytes())); } } fn iter_three<'a, I, F>(&'a self, reverse: bool, f: F) where F: FnOnce(u8, u8, u8, &'a [u8]) -> I, I: Iterator { if let Some(ns) = self.needles(3) { self.iter(reverse, f(ns[0], ns[1], ns[2], self.corpus.as_bytes())); } } /// Test that the positions yielded by the given iterator match the /// positions in this test. If reverse is true, then reverse the positions /// before comparing them. fn iter>(&self, reverse: bool, it: I) { assert_eq!( self.positions(0, reverse), it.collect::>(), r"search for {:?} failed in: {:?}", self.needles.iter().map(|&b| b as char).collect::>(), self.corpus ); } /// Expand this test into many variations of the same test. /// /// In particular, this will generate more tests with larger corpus sizes. /// The expected positions are updated to maintain the integrity of the /// test. /// /// This is important in testing a memchr implementation, because there are /// often different cases depending on the length of the corpus. /// /// Note that we extend the corpus by adding `%` bytes, which we /// don't otherwise use as a needle. fn expand(&self) -> Vec { let mut more = Vec::new(); // Add bytes to the start of the corpus. for i in 1..515 { let mut t = self.clone(); let mut new_corpus: String = repeat('%').take(i).collect(); new_corpus.push_str(&t.corpus); t.corpus = new_corpus; t.positions = t.positions.into_iter().map(|p| p + i).collect(); more.push(t); } // Add bytes to the end of the corpus. for i in 1..515 { let mut t = self.clone(); let mut padding: String = repeat('%').take(i).collect(); t.corpus.push_str(&padding); more.push(t); } more } /// Return the corpus at the given alignment. /// /// If the alignment exceeds the length of the corpus, then this returns /// an empty slice. fn corpus(&self, align: usize) -> &str { self.corpus.get(align..).unwrap_or("") } /// Return exactly `count` needles from this test. If this test has less /// than `count` needles, then add `#` until the number of needles /// matches `count`. 
If this test has more than `count` needles, then /// return `None` (because there is no way to use this test data for a /// search using fewer needles). fn needles(&self, count: usize) -> Option> { if self.needles.len() > count { return None; } let mut needles = self.needles.to_vec(); for _ in needles.len()..count { // we assume # is never used in tests. needles.push(b'#'); } Some(needles) } /// Return the positions in this test, reversed if `reverse` is true. /// /// If alignment is given, then all positions greater than or equal to that /// alignment are offset by the alignment. Positions less than the /// alignment are dropped. fn positions(&self, align: usize, reverse: bool) -> Vec { let positions = if reverse { let mut positions = self.positions.to_vec(); positions.reverse(); positions } else { self.positions.to_vec() }; positions .into_iter() .filter(|&p| p >= align) .map(|p| p - align) .collect() } } memchr-2.2.1/src/x86/avx.rs010064400017500000144000000614421351045146000135720ustar0000000000000000use core::arch::x86_64::*; use core::cmp; use core::mem::size_of; use x86::sse2; const VECTOR_SIZE: usize = size_of::<__m256i>(); const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; // The number of bytes to loop at in one iteration of memchr/memrchr. const LOOP_SIZE: usize = 4 * VECTOR_SIZE; // The number of bytes to loop at in one iteration of memchr2/memrchr2 and // memchr3/memrchr3. There was no observable difference between 128 and 64 // bytes in benchmarks. memchr3 in particular only gets a very slight speed up // from the loop unrolling. const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; #[target_feature(enable = "avx2")] pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option { // For a high level explanation for how this algorithm works, see the // sse2 implementation. The avx implementation here is the same, but with // 256-bit vectors instead of 128-bit vectors. let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; if haystack.len() < VECTOR_SIZE { // For small haystacks, defer to the SSE2 implementation. Codegen // suggests this completely avoids touching the AVX vectors. 
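        // (Editorial note: VECTOR_SIZE is size_of::<__m256i>() == 32 here, so
        // this branch covers haystacks shorter than 32 bytes.)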
return sse2::memchr(n1, haystack); } let vn1 = _mm256_set1_epi8(n1 as i8); let loop_size = cmp::min(LOOP_SIZE, haystack.len()); if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); let a = _mm256_load_si256(ptr as *const __m256i); let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); let eqa = _mm256_cmpeq_epi8(vn1, a); let eqb = _mm256_cmpeq_epi8(vn1, b); let eqc = _mm256_cmpeq_epi8(vn1, c); let eqd = _mm256_cmpeq_epi8(vn1, d); let or1 = _mm256_or_si256(eqa, eqb); let or2 = _mm256_or_si256(eqc, eqd); let or3 = _mm256_or_si256(or1, or2); if _mm256_movemask_epi8(or3) != 0 { let mut at = sub(ptr, start_ptr); let mask = _mm256_movemask_epi8(eqa); if mask != 0 { return Some(at + forward_pos(mask)); } at += VECTOR_SIZE; let mask = _mm256_movemask_epi8(eqb); if mask != 0 { return Some(at + forward_pos(mask)); } at += VECTOR_SIZE; let mask = _mm256_movemask_epi8(eqc); if mask != 0 { return Some(at + forward_pos(mask)); } at += VECTOR_SIZE; let mask = _mm256_movemask_epi8(eqd); debug_assert!(mask != 0); return Some(at + forward_pos(mask)); } ptr = ptr.add(loop_size); } while ptr <= end_ptr.sub(VECTOR_SIZE) { debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } ptr = ptr.add(VECTOR_SIZE); } if ptr < end_ptr { debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); return forward_search1(start_ptr, end_ptr, ptr, vn1); } None } #[target_feature(enable = "avx2")] pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { let vn1 = _mm256_set1_epi8(n1 as i8); let vn2 = _mm256_set1_epi8(n2 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE2, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; if haystack.len() < VECTOR_SIZE { while ptr < end_ptr { if *ptr == n1 || *ptr == n2 { return Some(sub(ptr, start_ptr)); } ptr = ptr.offset(1); } return None; } if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); let a = _mm256_load_si256(ptr as *const __m256i); let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); let eqa1 = _mm256_cmpeq_epi8(vn1, a); let eqb1 = _mm256_cmpeq_epi8(vn1, b); let eqa2 = _mm256_cmpeq_epi8(vn2, a); let eqb2 = _mm256_cmpeq_epi8(vn2, b); let or1 = _mm256_or_si256(eqa1, eqb1); let or2 = _mm256_or_si256(eqa2, eqb2); let or3 = _mm256_or_si256(or1, or2); if _mm256_movemask_epi8(or3) != 0 { let mut at = sub(ptr, start_ptr); let mask1 = _mm256_movemask_epi8(eqa1); let mask2 = _mm256_movemask_epi8(eqa2); if mask1 != 0 || mask2 != 0 { return Some(at + forward_pos2(mask1, mask2)); } at += VECTOR_SIZE; let mask1 = _mm256_movemask_epi8(eqb1); let mask2 = _mm256_movemask_epi8(eqb2); return Some(at + 
forward_pos2(mask1, mask2)); } ptr = ptr.add(loop_size); } while ptr <= end_ptr.sub(VECTOR_SIZE) { if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } ptr = ptr.add(VECTOR_SIZE); } if ptr < end_ptr { debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); } None } #[target_feature(enable = "avx2")] pub unsafe fn memchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8] ) -> Option { let vn1 = _mm256_set1_epi8(n1 as i8); let vn2 = _mm256_set1_epi8(n2 as i8); let vn3 = _mm256_set1_epi8(n3 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE2, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; if haystack.len() < VECTOR_SIZE { while ptr < end_ptr { if *ptr == n1 || *ptr == n2 || *ptr == n3 { return Some(sub(ptr, start_ptr)); } ptr = ptr.offset(1); } return None; } if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); let a = _mm256_load_si256(ptr as *const __m256i); let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); let eqa1 = _mm256_cmpeq_epi8(vn1, a); let eqb1 = _mm256_cmpeq_epi8(vn1, b); let eqa2 = _mm256_cmpeq_epi8(vn2, a); let eqb2 = _mm256_cmpeq_epi8(vn2, b); let eqa3 = _mm256_cmpeq_epi8(vn3, a); let eqb3 = _mm256_cmpeq_epi8(vn3, b); let or1 = _mm256_or_si256(eqa1, eqb1); let or2 = _mm256_or_si256(eqa2, eqb2); let or3 = _mm256_or_si256(eqa3, eqb3); let or4 = _mm256_or_si256(or1, or2); let or5 = _mm256_or_si256(or3, or4); if _mm256_movemask_epi8(or5) != 0 { let mut at = sub(ptr, start_ptr); let mask1 = _mm256_movemask_epi8(eqa1); let mask2 = _mm256_movemask_epi8(eqa2); let mask3 = _mm256_movemask_epi8(eqa3); if mask1 != 0 || mask2 != 0 || mask3 != 0 { return Some(at + forward_pos3(mask1, mask2, mask3)); } at += VECTOR_SIZE; let mask1 = _mm256_movemask_epi8(eqb1); let mask2 = _mm256_movemask_epi8(eqb2); let mask3 = _mm256_movemask_epi8(eqb3); if mask1 != 0 || mask2 != 0 || mask3 != 0 { return Some(at + forward_pos3(mask1, mask2, mask3)); } at += VECTOR_SIZE; let mask1 = _mm256_movemask_epi8(eqb1); let mask2 = _mm256_movemask_epi8(eqb2); let mask3 = _mm256_movemask_epi8(eqb3); return Some(at + forward_pos3(mask1, mask2, mask3)); } ptr = ptr.add(loop_size); } while ptr <= end_ptr.sub(VECTOR_SIZE) { if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } ptr = ptr.add(VECTOR_SIZE); } if ptr < end_ptr { debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); } None } #[target_feature(enable = "avx2")] pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option { let vn1 = _mm256_set1_epi8(n1 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = end_ptr; if haystack.len() < VECTOR_SIZE { while ptr > start_ptr { ptr = ptr.offset(-1); if *ptr == n1 { return Some(sub(ptr, start_ptr)); } } return None; } ptr = ptr.sub(VECTOR_SIZE); if 
let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); ptr = ptr.sub(loop_size); let a = _mm256_load_si256(ptr as *const __m256i); let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); let c = _mm256_load_si256(ptr.add(2 * VECTOR_SIZE) as *const __m256i); let d = _mm256_load_si256(ptr.add(3 * VECTOR_SIZE) as *const __m256i); let eqa = _mm256_cmpeq_epi8(vn1, a); let eqb = _mm256_cmpeq_epi8(vn1, b); let eqc = _mm256_cmpeq_epi8(vn1, c); let eqd = _mm256_cmpeq_epi8(vn1, d); let or1 = _mm256_or_si256(eqa, eqb); let or2 = _mm256_or_si256(eqc, eqd); let or3 = _mm256_or_si256(or1, or2); if _mm256_movemask_epi8(or3) != 0 { let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); let mask = _mm256_movemask_epi8(eqd); if mask != 0 { return Some(at + reverse_pos(mask)); } at -= VECTOR_SIZE; let mask = _mm256_movemask_epi8(eqc); if mask != 0 { return Some(at + reverse_pos(mask)); } at -= VECTOR_SIZE; let mask = _mm256_movemask_epi8(eqb); if mask != 0 { return Some(at + reverse_pos(mask)); } at -= VECTOR_SIZE; let mask = _mm256_movemask_epi8(eqa); debug_assert!(mask != 0); return Some(at + reverse_pos(mask)); } } while ptr >= start_ptr.add(VECTOR_SIZE) { ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } } if ptr > start_ptr { debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); } None } #[target_feature(enable = "avx2")] pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { let vn1 = _mm256_set1_epi8(n1 as i8); let vn2 = _mm256_set1_epi8(n2 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE2, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = end_ptr; if haystack.len() < VECTOR_SIZE { while ptr > start_ptr { ptr = ptr.offset(-1); if *ptr == n1 || *ptr == n2 { return Some(sub(ptr, start_ptr)); } } return None; } ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); ptr = ptr.sub(loop_size); let a = _mm256_load_si256(ptr as *const __m256i); let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); let eqa1 = _mm256_cmpeq_epi8(vn1, a); let eqb1 = _mm256_cmpeq_epi8(vn1, b); let eqa2 = _mm256_cmpeq_epi8(vn2, a); let eqb2 = _mm256_cmpeq_epi8(vn2, b); let or1 = _mm256_or_si256(eqa1, eqb1); let or2 = _mm256_or_si256(eqa2, eqb2); let or3 = _mm256_or_si256(or1, or2); if _mm256_movemask_epi8(or3) != 0 { let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); let mask1 = _mm256_movemask_epi8(eqb1); let mask2 = _mm256_movemask_epi8(eqb2); if mask1 != 0 || mask2 != 0 { return Some(at + reverse_pos2(mask1, mask2)); } at -= VECTOR_SIZE; let mask1 = _mm256_movemask_epi8(eqa1); let mask2 = _mm256_movemask_epi8(eqa2); return Some(at + reverse_pos2(mask1, mask2)); } } while ptr >= start_ptr.add(VECTOR_SIZE) { ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } } if ptr > start_ptr { debug_assert!(sub(ptr, start_ptr) < 
VECTOR_SIZE); return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); } None } #[target_feature(enable = "avx2")] pub unsafe fn memrchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8], ) -> Option { let vn1 = _mm256_set1_epi8(n1 as i8); let vn2 = _mm256_set1_epi8(n2 as i8); let vn3 = _mm256_set1_epi8(n3 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE2, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = end_ptr; if haystack.len() < VECTOR_SIZE { while ptr > start_ptr { ptr = ptr.offset(-1); if *ptr == n1 || *ptr == n2 || *ptr == n3 { return Some(sub(ptr, start_ptr)); } } return None; } ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); ptr = ptr.sub(loop_size); let a = _mm256_load_si256(ptr as *const __m256i); let b = _mm256_load_si256(ptr.add(VECTOR_SIZE) as *const __m256i); let eqa1 = _mm256_cmpeq_epi8(vn1, a); let eqb1 = _mm256_cmpeq_epi8(vn1, b); let eqa2 = _mm256_cmpeq_epi8(vn2, a); let eqb2 = _mm256_cmpeq_epi8(vn2, b); let eqa3 = _mm256_cmpeq_epi8(vn3, a); let eqb3 = _mm256_cmpeq_epi8(vn3, b); let or1 = _mm256_or_si256(eqa1, eqb1); let or2 = _mm256_or_si256(eqa2, eqb2); let or3 = _mm256_or_si256(eqa3, eqb3); let or4 = _mm256_or_si256(or1, or2); let or5 = _mm256_or_si256(or3, or4); if _mm256_movemask_epi8(or5) != 0 { let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); let mask1 = _mm256_movemask_epi8(eqb1); let mask2 = _mm256_movemask_epi8(eqb2); let mask3 = _mm256_movemask_epi8(eqb3); if mask1 != 0 || mask2 != 0 || mask3 != 0 { return Some(at + reverse_pos3(mask1, mask2, mask3)); } at -= VECTOR_SIZE; let mask1 = _mm256_movemask_epi8(eqa1); let mask2 = _mm256_movemask_epi8(eqa2); let mask3 = _mm256_movemask_epi8(eqa3); return Some(at + reverse_pos3(mask1, mask2, mask3)); } } while ptr >= start_ptr.add(VECTOR_SIZE) { ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } } if ptr > start_ptr { debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3); } None } #[target_feature(enable = "avx2")] unsafe fn forward_search1( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m256i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm256_loadu_si256(ptr as *const __m256i); let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, vn1)); if mask != 0 { Some(sub(ptr, start_ptr) + forward_pos(mask)) } else { None } } #[target_feature(enable = "avx2")] unsafe fn forward_search2( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m256i, vn2: __m256i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm256_loadu_si256(ptr as *const __m256i); let eq1 = _mm256_cmpeq_epi8(chunk, vn1); let eq2 = _mm256_cmpeq_epi8(chunk, vn2); if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { let mask1 = _mm256_movemask_epi8(eq1); let mask2 = _mm256_movemask_epi8(eq2); Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) } else { None } } #[target_feature(enable = 
"avx2")] unsafe fn forward_search3( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m256i, vn2: __m256i, vn3: __m256i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm256_loadu_si256(ptr as *const __m256i); let eq1 = _mm256_cmpeq_epi8(chunk, vn1); let eq2 = _mm256_cmpeq_epi8(chunk, vn2); let eq3 = _mm256_cmpeq_epi8(chunk, vn3); let or = _mm256_or_si256(eq1, eq2); if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { let mask1 = _mm256_movemask_epi8(eq1); let mask2 = _mm256_movemask_epi8(eq2); let mask3 = _mm256_movemask_epi8(eq3); Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) } else { None } } #[target_feature(enable = "avx2")] unsafe fn reverse_search1( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m256i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm256_loadu_si256(ptr as *const __m256i); let mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(vn1, chunk)); if mask != 0 { Some(sub(ptr, start_ptr) + reverse_pos(mask)) } else { None } } #[target_feature(enable = "avx2")] unsafe fn reverse_search2( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m256i, vn2: __m256i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm256_loadu_si256(ptr as *const __m256i); let eq1 = _mm256_cmpeq_epi8(chunk, vn1); let eq2 = _mm256_cmpeq_epi8(chunk, vn2); if _mm256_movemask_epi8(_mm256_or_si256(eq1, eq2)) != 0 { let mask1 = _mm256_movemask_epi8(eq1); let mask2 = _mm256_movemask_epi8(eq2); Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) } else { None } } #[target_feature(enable = "avx2")] unsafe fn reverse_search3( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m256i, vn2: __m256i, vn3: __m256i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm256_loadu_si256(ptr as *const __m256i); let eq1 = _mm256_cmpeq_epi8(chunk, vn1); let eq2 = _mm256_cmpeq_epi8(chunk, vn2); let eq3 = _mm256_cmpeq_epi8(chunk, vn3); let or = _mm256_or_si256(eq1, eq2); if _mm256_movemask_epi8(_mm256_or_si256(or, eq3)) != 0 { let mask1 = _mm256_movemask_epi8(eq1); let mask2 = _mm256_movemask_epi8(eq2); let mask3 = _mm256_movemask_epi8(eq3); Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) } else { None } } /// Compute the position of the first matching byte from the given mask. The /// position returned is always in the range [0, 31]. /// /// The mask given is expected to be the result of _mm256_movemask_epi8. fn forward_pos(mask: i32) -> usize { // We are dealing with little endian here, where the most significant byte // is at a higher address. That means the least significant bit that is set // corresponds to the position of our first matching byte. That position // corresponds to the number of zeros after the least significant bit. mask.trailing_zeros() as usize } /// Compute the position of the first matching byte from the given masks. The /// position returned is always in the range [0, 31]. Each mask corresponds to /// the equality comparison of a single byte. 
/// /// The masks given are expected to be the result of _mm256_movemask_epi8, /// where at least one of the masks is non-zero (i.e., indicates a match). fn forward_pos2(mask1: i32, mask2: i32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0); forward_pos(mask1 | mask2) } /// Compute the position of the first matching byte from the given masks. The /// position returned is always in the range [0, 31]. Each mask corresponds to /// the equality comparison of a single byte. /// /// The masks given are expected to be the result of _mm256_movemask_epi8, /// where at least one of the masks is non-zero (i.e., indicates a match). fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); forward_pos(mask1 | mask2 | mask3) } /// Compute the position of the last matching byte from the given mask. The /// position returned is always in the range [0, 31]. /// /// The mask given is expected to be the result of _mm256_movemask_epi8. fn reverse_pos(mask: i32) -> usize { // We are dealing with little endian here, where the most significant byte // is at a higher address. That means the most significant bit that is set // corresponds to the position of our last matching byte. The position from // the end of the mask is therefore the number of leading zeros in a 32 // bit integer, and the position from the start of the mask is therefore // 32 - (leading zeros) - 1. VECTOR_SIZE - (mask as u32).leading_zeros() as usize - 1 } /// Compute the position of the last matching byte from the given masks. The /// position returned is always in the range [0, 31]. Each mask corresponds to /// the equality comparison of a single byte. /// /// The masks given are expected to be the result of _mm256_movemask_epi8, /// where at least one of the masks is non-zero (i.e., indicates a match). fn reverse_pos2(mask1: i32, mask2: i32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0); reverse_pos(mask1 | mask2) } /// Compute the position of the last matching byte from the given masks. The /// position returned is always in the range [0, 31]. Each mask corresponds to /// the equality comparison of a single byte. /// /// The masks given are expected to be the result of _mm256_movemask_epi8, /// where at least one of the masks is non-zero (i.e., indicates a match). fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); reverse_pos(mask1 | mask2 | mask3) } /// Subtract `b` from `a` and return the difference. `a` should be greater than /// or equal to `b`. fn sub(a: *const u8, b: *const u8) -> usize { debug_assert!(a >= b); (a as usize) - (b as usize) } memchr-2.2.1/src/x86/mod.rs010064400017500000144000000077661351045146000135640ustar0000000000000000use fallback; // We only use AVX when we can detect at runtime whether it's available, which // requires std. #[cfg(feature = "use_std")] mod avx; mod sse2; // This macro employs a gcc-like "ifunc" trick where by upon first calling // `memchr` (for example), CPU feature detection will be performed at runtime // to determine the best implementation to use. After CPU feature detection // is done, we replace `memchr`'s function pointer with the selection. Upon // subsequent invocations, the CPU-specific routine is invoked directly, which // skips the CPU feature detection and subsequent branch that's required. 
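// A minimal sketch of the trick for a single hypothetical function (the
// names `find_avx2` and `find_fallback` are illustrative stand-ins, not
// functions in this crate), using a raw `*mut ()` pointer to erase the
// function type and the same `std::mem`/`std::sync::atomic` items as the
// macro below:
//
//     static FN: AtomicPtr<()> = AtomicPtr::new(detect as *mut ());
//
//     fn detect(haystack: &[u8]) -> Option<usize> {
//         let fun = if is_x86_feature_detected!("avx2") {
//             find_avx2 as *mut ()
//         } else {
//             find_fallback as *mut ()
//         };
//         FN.store(fun, Ordering::Relaxed);
//         unsafe { mem::transmute::<*mut (), fn(&[u8]) -> Option<usize>>(fun)(haystack) }
//     }
//
//     fn find(haystack: &[u8]) -> Option<usize> {
//         let fun = FN.load(Ordering::Relaxed);
//         unsafe { mem::transmute::<*mut (), fn(&[u8]) -> Option<usize>>(fun)(haystack) }
//     }
//
// `FN` starts out pointing at `detect`; the first call pays for feature
// detection and swaps in the specialized routine, and every later call jumps
// straight to it through the atomic pointer load.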
// // While this typically doesn't matter for rare occurrences or when used on // larger haystacks, `memchr` can be called in tight loops where the overhead // of this branch can actually add up *and is measurable*. This trick was // necessary to bring this implementation up to glibc's speeds for the 'tiny' // benchmarks, for example. // // At some point, I expect the Rust ecosystem will get a nice macro for doing // exactly this, at which point, we can replace our hand-jammed version of it. // // N.B. The ifunc strategy does prevent function inlining of course, but on // modern CPUs, you'll probably end up with the AVX2 implementation, which // probably can't be inlined anyway---unless you've compiled your entire // program with AVX2 enabled. However, even then, the various memchr // implementations aren't exactly small, so inlining might not help anyway! #[cfg(feature = "use_std")] macro_rules! ifunc { ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ use std::mem; use std::sync::atomic::{AtomicPtr, Ordering}; type FnRaw = *mut (); static FN: AtomicPtr<()> = AtomicPtr::new(detect as FnRaw); fn detect($($needle: u8),+, haystack: &[u8]) -> Option { let fun = if cfg!(memchr_runtime_avx) && is_x86_feature_detected!("avx2") { avx::$name as FnRaw } else if cfg!(memchr_runtime_sse2) { sse2::$name as FnRaw } else { fallback::$name as FnRaw }; FN.store(fun as FnRaw, Ordering::Relaxed); unsafe { mem::transmute::(fun)($($needle),+, haystack) } } unsafe { let fun = FN.load(Ordering::Relaxed); mem::transmute::(fun)($($needle),+, $haystack) } }} } // When std isn't available to provide runtime CPU feature detection, or if // runtime CPU feature detection has been explicitly disabled, then just call // our optimized SSE2 routine directly. SSE2 is avalbale on all x86_64 targets, // so no CPU feature detection is necessary. #[cfg(not(feature = "use_std"))] macro_rules! ifunc { ($fnty:ty, $name:ident, $haystack:ident, $($needle:ident),+) => {{ if cfg!(memchr_runtime_sse2) { unsafe { sse2::$name($($needle),+, $haystack) } } else { fallback::$name($($needle),+, $haystack) } }} } #[inline(always)] pub fn memchr(n1: u8, haystack: &[u8]) -> Option { ifunc!(fn(u8, &[u8]) -> Option, memchr, haystack, n1) } #[inline(always)] pub fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { ifunc!(fn(u8, u8, &[u8]) -> Option, memchr2, haystack, n1, n2) } #[inline(always)] pub fn memchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { ifunc!(fn(u8, u8, u8, &[u8]) -> Option, memchr3, haystack, n1, n2, n3) } #[inline(always)] pub fn memrchr(n1: u8, haystack: &[u8]) -> Option { ifunc!(fn(u8, &[u8]) -> Option, memrchr, haystack, n1) } #[inline(always)] pub fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { ifunc!(fn(u8, u8, &[u8]) -> Option, memrchr2, haystack, n1, n2) } #[inline(always)] pub fn memrchr3(n1: u8, n2: u8, n3: u8, haystack: &[u8]) -> Option { ifunc!(fn(u8, u8, u8, &[u8]) -> Option, memrchr3, haystack, n1, n2, n3) } memchr-2.2.1/src/x86/sse2.rs010064400017500000144000000722141351045146000136470ustar0000000000000000use core::arch::x86_64::*; use core::cmp; use core::mem::size_of; const VECTOR_SIZE: usize = size_of::<__m128i>(); const VECTOR_ALIGN: usize = VECTOR_SIZE - 1; // The number of bytes to loop at in one iteration of memchr/memrchr. const LOOP_SIZE: usize = 4 * VECTOR_SIZE; // The number of bytes to loop at in one iteration of memchr2/memrchr2 and // memchr3/memrchr3. There was no observable difference between 64 and 32 bytes // in benchmarks. 
memchr3 in particular only gets a very slight speed up from // the loop unrolling. const LOOP_SIZE2: usize = 2 * VECTOR_SIZE; #[target_feature(enable = "sse2")] pub unsafe fn memchr(n1: u8, haystack: &[u8]) -> Option { // What follows is a fast SSE2-only algorithm to detect the position of // `n1` in `haystack` if it exists. From what I know, this is the "classic" // algorithm. I believe it can be found in places like glibc and Go's // standard library. It appears to be well known and is elaborated on in // more detail here: https://gms.tf/stdfind-and-memchr-optimizations.html // // While this routine is very long, the basic idea is actually very simple // and can be expressed straight-forwardly in pseudo code: // // needle = (n1 << 15) | (n1 << 14) | ... | (n1 << 1) | n1 // while i <= haystack.len() - 16: // // A 16 byte vector. Each byte in chunk corresponds to a byte in // // the haystack. // chunk = haystack[i:i+16] // // Compare bytes in needle with bytes in chunk. The result is a 16 // // byte chunk where each byte is 0xFF if the corresponding bytes // // in needle and chunk were equal, or 0x00 otherwise. // eqs = cmpeq(needle, chunk) // // Return a 32 bit integer where the most significant 16 bits // // are always 0 and the lower 16 bits correspond to whether the // // most significant bit in the correspond byte in `eqs` is set. // // In other words, `mask as u16` has bit i set if and only if // // needle[i] == chunk[i]. // mask = movemask(eqs) // // // Mask is 0 if there is no match, and non-zero otherwise. // if mask != 0: // // trailing_zeros tells us the position of the least significant // // bit that is set. // return i + trailing_zeros(mask) // // // haystack length may not be a multiple of 16, so search the rest. // while i < haystack.len(): // if haystack[i] == n1: // return i // // // No match found. // return NULL // // In fact, we could loosely translate the above code to Rust line-for-line // and it would be a pretty fast algorithm. But, we pull out all the stops // to go as fast as possible: // // 1. We use aligned loads. That is, we do some finagling to make sure our // primary loop not only proceeds in increments of 16 bytes, but that // the address of haystack's pointer that we dereference is aligned to // 16 bytes. 16 is a magic number here because it is the size of SSE2 // 128-bit vector. (For the AVX2 algorithm, 32 is the magic number.) // Therefore, to get aligned loads, our pointer's address must be evenly // divisible by 16. // 2. Our primary loop proceeds 64 bytes at a time instead of 16. It's // kind of like loop unrolling, but we combine the equality comparisons // using a vector OR such that we only need to extract a single mask to // determine whether a match exists or not. If so, then we do some // book-keeping to determine the precise location but otherwise mush on. // 3. We use our "chunk" comparison routine in as many places as possible, // even if it means using unaligned loads. In particular, if haystack // starts with an unaligned address, then we do an unaligned load to // search the first 16 bytes. We then start our primary loop at the // smallest subsequent aligned address, which will actually overlap with // previously searched bytes. But we're OK with that. We do a similar // dance at the end of our primary loop. Finally, to avoid a // byte-at-a-time loop at the end, we do a final 16 byte unaligned load // that may overlap with a previous load. This is OK because it converts // a loop into a small number of very fast vector instructions. 
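// As a concrete illustration of point 3 above (the numbers are hypothetical):
// for a 20 byte haystack whose starting address happens to be 16 byte
// aligned, the initial unaligned load examines bytes 0..16, the 64 byte
// primary loop never runs (20 < 64), and the final unaligned load examines
// bytes 4..20. Bytes 4..16 are therefore inspected twice, which is harmless:
// if any of them had matched, we would already have returned after the first
// load.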
// // The primary downside of this algorithm is that it's effectively // completely unsafe. Therefore, we have to be super careful to avoid // undefined behavior: // // 1. We use raw pointers everywhere. Not only does dereferencing a pointer // require the pointer to be valid, but we actually can't even store the // address of an invalid pointer (unless it's 1 past the end of // haystack) without sacrificing performance. // 2. _mm_loadu_si128 is used when you don't care about alignment, and // _mm_load_si128 is used when you do care. You cannot use the latter // on unaligned pointers. // 3. We make liberal use of debug_assert! to check assumptions. // 4. We make a concerted effort to stick with pointers instead of indices. // Indices are nicer because there's less to worry about with them (see // above about pointer offsets), but I could not get the compiler to // produce as good of code as what the below produces. In any case, // pointers are what we really care about here, and alignment is // expressed a bit more naturally with them. // // In general, most of the algorithms in this crate have a similar // structure to what you see below, so this comment applies fairly well to // all of them. let vn1 = _mm_set1_epi8(n1 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; if haystack.len() < VECTOR_SIZE { while ptr < end_ptr { if *ptr == n1 { return Some(sub(ptr, start_ptr)); } ptr = ptr.offset(1); } return None; } if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); while loop_size == LOOP_SIZE && ptr <= end_ptr.sub(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); let a = _mm_load_si128(ptr as *const __m128i); let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); let eqa = _mm_cmpeq_epi8(vn1, a); let eqb = _mm_cmpeq_epi8(vn1, b); let eqc = _mm_cmpeq_epi8(vn1, c); let eqd = _mm_cmpeq_epi8(vn1, d); let or1 = _mm_or_si128(eqa, eqb); let or2 = _mm_or_si128(eqc, eqd); let or3 = _mm_or_si128(or1, or2); if _mm_movemask_epi8(or3) != 0 { let mut at = sub(ptr, start_ptr); let mask = _mm_movemask_epi8(eqa); if mask != 0 { return Some(at + forward_pos(mask)); } at += VECTOR_SIZE; let mask = _mm_movemask_epi8(eqb); if mask != 0 { return Some(at + forward_pos(mask)); } at += VECTOR_SIZE; let mask = _mm_movemask_epi8(eqc); if mask != 0 { return Some(at + forward_pos(mask)); } at += VECTOR_SIZE; let mask = _mm_movemask_epi8(eqd); debug_assert!(mask != 0); return Some(at + forward_pos(mask)); } ptr = ptr.add(loop_size); } while ptr <= end_ptr.sub(VECTOR_SIZE) { debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE); if let Some(i) = forward_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } ptr = ptr.add(VECTOR_SIZE); } if ptr < end_ptr { debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); return forward_search1(start_ptr, end_ptr, ptr, vn1); } None } #[target_feature(enable = "sse2")] pub unsafe fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { let vn1 = _mm_set1_epi8(n1 as i8); let vn2 = _mm_set1_epi8(n2 as i8); let len = haystack.len(); let 
loop_size = cmp::min(LOOP_SIZE2, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; if haystack.len() < VECTOR_SIZE { while ptr < end_ptr { if *ptr == n1 || *ptr == n2 { return Some(sub(ptr, start_ptr)); } ptr = ptr.offset(1); } return None; } if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); let a = _mm_load_si128(ptr as *const __m128i); let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); let eqa1 = _mm_cmpeq_epi8(vn1, a); let eqb1 = _mm_cmpeq_epi8(vn1, b); let eqa2 = _mm_cmpeq_epi8(vn2, a); let eqb2 = _mm_cmpeq_epi8(vn2, b); let or1 = _mm_or_si128(eqa1, eqb1); let or2 = _mm_or_si128(eqa2, eqb2); let or3 = _mm_or_si128(or1, or2); if _mm_movemask_epi8(or3) != 0 { let mut at = sub(ptr, start_ptr); let mask1 = _mm_movemask_epi8(eqa1); let mask2 = _mm_movemask_epi8(eqa2); if mask1 != 0 || mask2 != 0 { return Some(at + forward_pos2(mask1, mask2)); } at += VECTOR_SIZE; let mask1 = _mm_movemask_epi8(eqb1); let mask2 = _mm_movemask_epi8(eqb2); return Some(at + forward_pos2(mask1, mask2)); } ptr = ptr.add(loop_size); } while ptr <= end_ptr.sub(VECTOR_SIZE) { if let Some(i) = forward_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } ptr = ptr.add(VECTOR_SIZE); } if ptr < end_ptr { debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); return forward_search2(start_ptr, end_ptr, ptr, vn1, vn2); } None } #[target_feature(enable = "sse2")] pub unsafe fn memchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8] ) -> Option { let vn1 = _mm_set1_epi8(n1 as i8); let vn2 = _mm_set1_epi8(n2 as i8); let vn3 = _mm_set1_epi8(n3 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE2, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; if haystack.len() < VECTOR_SIZE { while ptr < end_ptr { if *ptr == n1 || *ptr == n2 || *ptr == n3 { return Some(sub(ptr, start_ptr)); } ptr = ptr.offset(1); } return None; } if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN)); debug_assert!(ptr > start_ptr && end_ptr.sub(VECTOR_SIZE) >= start_ptr); while loop_size == LOOP_SIZE2 && ptr <= end_ptr.sub(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); let a = _mm_load_si128(ptr as *const __m128i); let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); let eqa1 = _mm_cmpeq_epi8(vn1, a); let eqb1 = _mm_cmpeq_epi8(vn1, b); let eqa2 = _mm_cmpeq_epi8(vn2, a); let eqb2 = _mm_cmpeq_epi8(vn2, b); let eqa3 = _mm_cmpeq_epi8(vn3, a); let eqb3 = _mm_cmpeq_epi8(vn3, b); let or1 = _mm_or_si128(eqa1, eqb1); let or2 = _mm_or_si128(eqa2, eqb2); let or3 = _mm_or_si128(eqa3, eqb3); let or4 = _mm_or_si128(or1, or2); let or5 = _mm_or_si128(or3, or4); if _mm_movemask_epi8(or5) != 0 { let mut at = sub(ptr, start_ptr); let mask1 = _mm_movemask_epi8(eqa1); let mask2 = _mm_movemask_epi8(eqa2); let mask3 = _mm_movemask_epi8(eqa3); if mask1 != 0 || mask2 != 0 || mask3 != 0 { return Some(at + forward_pos3(mask1, mask2, mask3)); } at += VECTOR_SIZE; let mask1 = _mm_movemask_epi8(eqb1); let mask2 = 
_mm_movemask_epi8(eqb2); let mask3 = _mm_movemask_epi8(eqb3); if mask1 != 0 || mask2 != 0 || mask3 != 0 { return Some(at + forward_pos3(mask1, mask2, mask3)); } at += VECTOR_SIZE; let mask1 = _mm_movemask_epi8(eqb1); let mask2 = _mm_movemask_epi8(eqb2); let mask3 = _mm_movemask_epi8(eqb3); return Some(at + forward_pos3(mask1, mask2, mask3)); } ptr = ptr.add(loop_size); } while ptr <= end_ptr.sub(VECTOR_SIZE) { if let Some(i) = forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } ptr = ptr.add(VECTOR_SIZE); } if ptr < end_ptr { debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); return forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); } None } #[target_feature(enable = "sse2")] pub unsafe fn memrchr(n1: u8, haystack: &[u8]) -> Option { let vn1 = _mm_set1_epi8(n1 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = end_ptr; if haystack.len() < VECTOR_SIZE { while ptr > start_ptr { ptr = ptr.offset(-1); if *ptr == n1 { return Some(sub(ptr, start_ptr)); } } return None; } ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while loop_size == LOOP_SIZE && ptr >= start_ptr.add(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); ptr = ptr.sub(loop_size); let a = _mm_load_si128(ptr as *const __m128i); let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i); let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i); let eqa = _mm_cmpeq_epi8(vn1, a); let eqb = _mm_cmpeq_epi8(vn1, b); let eqc = _mm_cmpeq_epi8(vn1, c); let eqd = _mm_cmpeq_epi8(vn1, d); let or1 = _mm_or_si128(eqa, eqb); let or2 = _mm_or_si128(eqc, eqd); let or3 = _mm_or_si128(or1, or2); if _mm_movemask_epi8(or3) != 0 { let mut at = sub(ptr.add(3 * VECTOR_SIZE), start_ptr); let mask = _mm_movemask_epi8(eqd); if mask != 0 { return Some(at + reverse_pos(mask)); } at -= VECTOR_SIZE; let mask = _mm_movemask_epi8(eqc); if mask != 0 { return Some(at + reverse_pos(mask)); } at -= VECTOR_SIZE; let mask = _mm_movemask_epi8(eqb); if mask != 0 { return Some(at + reverse_pos(mask)); } at -= VECTOR_SIZE; let mask = _mm_movemask_epi8(eqa); debug_assert!(mask != 0); return Some(at + reverse_pos(mask)); } } while ptr >= start_ptr.add(VECTOR_SIZE) { ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search1(start_ptr, end_ptr, ptr, vn1) { return Some(i); } } if ptr > start_ptr { debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); return reverse_search1(start_ptr, end_ptr, start_ptr, vn1); } None } #[target_feature(enable = "sse2")] pub unsafe fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option { let vn1 = _mm_set1_epi8(n1 as i8); let vn2 = _mm_set1_epi8(n2 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE2, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = end_ptr; if haystack.len() < VECTOR_SIZE { while ptr > start_ptr { ptr = ptr.offset(-1); if *ptr == n1 || *ptr == n2 { return Some(sub(ptr, start_ptr)); } } return None; } ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } ptr = (end_ptr as usize & !VECTOR_ALIGN) 
as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); ptr = ptr.sub(loop_size); let a = _mm_load_si128(ptr as *const __m128i); let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); let eqa1 = _mm_cmpeq_epi8(vn1, a); let eqb1 = _mm_cmpeq_epi8(vn1, b); let eqa2 = _mm_cmpeq_epi8(vn2, a); let eqb2 = _mm_cmpeq_epi8(vn2, b); let or1 = _mm_or_si128(eqa1, eqb1); let or2 = _mm_or_si128(eqa2, eqb2); let or3 = _mm_or_si128(or1, or2); if _mm_movemask_epi8(or3) != 0 { let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); let mask1 = _mm_movemask_epi8(eqb1); let mask2 = _mm_movemask_epi8(eqb2); if mask1 != 0 || mask2 != 0 { return Some(at + reverse_pos2(mask1, mask2)); } at -= VECTOR_SIZE; let mask1 = _mm_movemask_epi8(eqa1); let mask2 = _mm_movemask_epi8(eqa2); return Some(at + reverse_pos2(mask1, mask2)); } } while ptr >= start_ptr.add(VECTOR_SIZE) { ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search2(start_ptr, end_ptr, ptr, vn1, vn2) { return Some(i); } } if ptr > start_ptr { debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); return reverse_search2(start_ptr, end_ptr, start_ptr, vn1, vn2); } None } #[target_feature(enable = "sse2")] pub unsafe fn memrchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8], ) -> Option { let vn1 = _mm_set1_epi8(n1 as i8); let vn2 = _mm_set1_epi8(n2 as i8); let vn3 = _mm_set1_epi8(n3 as i8); let len = haystack.len(); let loop_size = cmp::min(LOOP_SIZE2, len); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = end_ptr; if haystack.len() < VECTOR_SIZE { while ptr > start_ptr { ptr = ptr.offset(-1); if *ptr == n1 || *ptr == n2 || *ptr == n3 { return Some(sub(ptr, start_ptr)); } } return None; } ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } ptr = (end_ptr as usize & !VECTOR_ALIGN) as *const u8; debug_assert!(start_ptr <= ptr && ptr <= end_ptr); while loop_size == LOOP_SIZE2 && ptr >= start_ptr.add(loop_size) { debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE); ptr = ptr.sub(loop_size); let a = _mm_load_si128(ptr as *const __m128i); let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i); let eqa1 = _mm_cmpeq_epi8(vn1, a); let eqb1 = _mm_cmpeq_epi8(vn1, b); let eqa2 = _mm_cmpeq_epi8(vn2, a); let eqb2 = _mm_cmpeq_epi8(vn2, b); let eqa3 = _mm_cmpeq_epi8(vn3, a); let eqb3 = _mm_cmpeq_epi8(vn3, b); let or1 = _mm_or_si128(eqa1, eqb1); let or2 = _mm_or_si128(eqa2, eqb2); let or3 = _mm_or_si128(eqa3, eqb3); let or4 = _mm_or_si128(or1, or2); let or5 = _mm_or_si128(or3, or4); if _mm_movemask_epi8(or5) != 0 { let mut at = sub(ptr.add(VECTOR_SIZE), start_ptr); let mask1 = _mm_movemask_epi8(eqb1); let mask2 = _mm_movemask_epi8(eqb2); let mask3 = _mm_movemask_epi8(eqb3); if mask1 != 0 || mask2 != 0 || mask3 != 0 { return Some(at + reverse_pos3(mask1, mask2, mask3)); } at -= VECTOR_SIZE; let mask1 = _mm_movemask_epi8(eqa1); let mask2 = _mm_movemask_epi8(eqa2); let mask3 = _mm_movemask_epi8(eqa3); return Some(at + reverse_pos3(mask1, mask2, mask3)); } } while ptr >= start_ptr.add(VECTOR_SIZE) { ptr = ptr.sub(VECTOR_SIZE); if let Some(i) = reverse_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3) { return Some(i); } } if ptr > start_ptr { debug_assert!(sub(ptr, start_ptr) < VECTOR_SIZE); return reverse_search3(start_ptr, end_ptr, start_ptr, vn1, vn2, vn3); } None } #[target_feature(enable = "sse2")] pub unsafe fn forward_search1( 
start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm_loadu_si128(ptr as *const __m128i); let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, vn1)); if mask != 0 { Some(sub(ptr, start_ptr) + forward_pos(mask)) } else { None } } #[target_feature(enable = "sse2")] unsafe fn forward_search2( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, vn2: __m128i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm_loadu_si128(ptr as *const __m128i); let eq1 = _mm_cmpeq_epi8(chunk, vn1); let eq2 = _mm_cmpeq_epi8(chunk, vn2); if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { let mask1 = _mm_movemask_epi8(eq1); let mask2 = _mm_movemask_epi8(eq2); Some(sub(ptr, start_ptr) + forward_pos2(mask1, mask2)) } else { None } } #[target_feature(enable = "sse2")] pub unsafe fn forward_search3( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, vn2: __m128i, vn3: __m128i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm_loadu_si128(ptr as *const __m128i); let eq1 = _mm_cmpeq_epi8(chunk, vn1); let eq2 = _mm_cmpeq_epi8(chunk, vn2); let eq3 = _mm_cmpeq_epi8(chunk, vn3); let or = _mm_or_si128(eq1, eq2); if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { let mask1 = _mm_movemask_epi8(eq1); let mask2 = _mm_movemask_epi8(eq2); let mask3 = _mm_movemask_epi8(eq3); Some(sub(ptr, start_ptr) + forward_pos3(mask1, mask2, mask3)) } else { None } } #[target_feature(enable = "sse2")] unsafe fn reverse_search1( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm_loadu_si128(ptr as *const __m128i); let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(vn1, chunk)); if mask != 0 { Some(sub(ptr, start_ptr) + reverse_pos(mask)) } else { None } } #[target_feature(enable = "sse2")] unsafe fn reverse_search2( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, vn2: __m128i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm_loadu_si128(ptr as *const __m128i); let eq1 = _mm_cmpeq_epi8(chunk, vn1); let eq2 = _mm_cmpeq_epi8(chunk, vn2); if _mm_movemask_epi8(_mm_or_si128(eq1, eq2)) != 0 { let mask1 = _mm_movemask_epi8(eq1); let mask2 = _mm_movemask_epi8(eq2); Some(sub(ptr, start_ptr) + reverse_pos2(mask1, mask2)) } else { None } } #[target_feature(enable = "sse2")] unsafe fn reverse_search3( start_ptr: *const u8, end_ptr: *const u8, ptr: *const u8, vn1: __m128i, vn2: __m128i, vn3: __m128i, ) -> Option { debug_assert!(sub(end_ptr, start_ptr) >= VECTOR_SIZE); debug_assert!(start_ptr <= ptr); debug_assert!(ptr <= end_ptr.sub(VECTOR_SIZE)); let chunk = _mm_loadu_si128(ptr as *const __m128i); let eq1 = _mm_cmpeq_epi8(chunk, vn1); let eq2 = _mm_cmpeq_epi8(chunk, vn2); let eq3 = _mm_cmpeq_epi8(chunk, vn3); let or = _mm_or_si128(eq1, eq2); if _mm_movemask_epi8(_mm_or_si128(or, eq3)) != 0 { let mask1 = _mm_movemask_epi8(eq1); let mask2 = _mm_movemask_epi8(eq2); let mask3 = _mm_movemask_epi8(eq3); 
Some(sub(ptr, start_ptr) + reverse_pos3(mask1, mask2, mask3)) } else { None } } /// Compute the position of the first matching byte from the given mask. The /// position returned is always in the range [0, 15]. /// /// The mask given is expected to be the result of _mm_movemask_epi8. fn forward_pos(mask: i32) -> usize { // We are dealing with little endian here, where the most significant byte // is at a higher address. That means the least significant bit that is set // corresponds to the position of our first matching byte. That position // corresponds to the number of zeros after the least significant bit. mask.trailing_zeros() as usize } /// Compute the position of the first matching byte from the given masks. The /// position returned is always in the range [0, 15]. Each mask corresponds to /// the equality comparison of a single byte. /// /// The masks given are expected to be the result of _mm_movemask_epi8, where /// at least one of the masks is non-zero (i.e., indicates a match). fn forward_pos2(mask1: i32, mask2: i32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0); forward_pos(mask1 | mask2) } /// Compute the position of the first matching byte from the given masks. The /// position returned is always in the range [0, 15]. Each mask corresponds to /// the equality comparison of a single byte. /// /// The masks given are expected to be the result of _mm_movemask_epi8, where /// at least one of the masks is non-zero (i.e., indicates a match). fn forward_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); forward_pos(mask1 | mask2 | mask3) } /// Compute the position of the last matching byte from the given mask. The /// position returned is always in the range [0, 15]. /// /// The mask given is expected to be the result of _mm_movemask_epi8. fn reverse_pos(mask: i32) -> usize { // We are dealing with little endian here, where the most significant byte // is at a higher address. That means the most significant bit that is set // corresponds to the position of our last matching byte. The position from // the end of the mask is therefore the number of leading zeros in a 16 // bit integer, and the position from the start of the mask is therefore // 16 - (leading zeros) - 1. VECTOR_SIZE - (mask as u16).leading_zeros() as usize - 1 } /// Compute the position of the last matching byte from the given masks. The /// position returned is always in the range [0, 15]. Each mask corresponds to /// the equality comparison of a single byte. /// /// The masks given are expected to be the result of _mm_movemask_epi8, where /// at least one of the masks is non-zero (i.e., indicates a match). fn reverse_pos2(mask1: i32, mask2: i32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0); reverse_pos(mask1 | mask2) } /// Compute the position of the last matching byte from the given masks. The /// position returned is always in the range [0, 15]. Each mask corresponds to /// the equality comparison of a single byte. /// /// The masks given are expected to be the result of _mm_movemask_epi8, where /// at least one of the masks is non-zero (i.e., indicates a match). fn reverse_pos3(mask1: i32, mask2: i32, mask3: i32) -> usize { debug_assert!(mask1 != 0 || mask2 != 0 || mask3 != 0); reverse_pos(mask1 | mask2 | mask3) } /// Subtract `b` from `a` and return the difference. `a` should be greater than /// or equal to `b`. 
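/// Throughout this module the result is used as a byte index into the
/// haystack: for example, `sub(start_ptr.add(3), start_ptr)` is `3`.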
fn sub(a: *const u8, b: *const u8) -> usize { debug_assert!(a >= b); (a as usize) - (b as usize) } memchr-2.2.1/src/x86/sse42.rs010064400017500000144000000046551335001377300137420ustar0000000000000000// This code is unused. PCMPESTRI is gratuitously slow. I imagine it might // start winning with a hypothetical memchr4 (or greater). This technique might // also be good for exposing searches over ranges of bytes, but that departs // from the standard memchr API, so it's not clear whether we actually want // that or not. // // N.B. PCMPISTRI appears to be about twice as fast as PCMPESTRI, which is kind // of neat. Unfortunately, UTF-8 strings can contain NUL bytes, which means // I don't see a way of effectively using PCMPISTRI unless there's some fast // way to replace zero bytes with a byte that is not not a needle byte. use core::arch::x86_64::*; use core::mem::size_of; use x86::sse2; const VECTOR_SIZE: usize = size_of::<__m128i>(); const CONTROL_ANY: i32 = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_POSITIVE_POLARITY | _SIDD_LEAST_SIGNIFICANT; #[target_feature(enable = "sse4.2")] pub unsafe fn memchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8] ) -> Option { let vn1 = _mm_set1_epi8(n1 as i8); let vn2 = _mm_set1_epi8(n2 as i8); let vn3 = _mm_set1_epi8(n3 as i8); let vn = _mm_setr_epi8( n1 as i8, n2 as i8, n3 as i8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ); let len = haystack.len(); let start_ptr = haystack.as_ptr(); let end_ptr = haystack[haystack.len()..].as_ptr(); let mut ptr = start_ptr; if haystack.len() < VECTOR_SIZE { while ptr < end_ptr { if *ptr == n1 || *ptr == n2 || *ptr == n3 { return Some(sub(ptr, start_ptr)); } ptr = ptr.offset(1); } return None; } while ptr <= end_ptr.sub(VECTOR_SIZE) { let chunk = _mm_loadu_si128(ptr as *const __m128i); let res = _mm_cmpestri(vn, 3, chunk, 16, CONTROL_ANY); if res < 16 { return Some(sub(ptr, start_ptr) + res as usize); } ptr = ptr.add(VECTOR_SIZE); } if ptr < end_ptr { debug_assert!(sub(end_ptr, ptr) < VECTOR_SIZE); ptr = ptr.sub(VECTOR_SIZE - sub(end_ptr, ptr)); debug_assert_eq!(sub(end_ptr, ptr), VECTOR_SIZE); return sse2::forward_search3(start_ptr, end_ptr, ptr, vn1, vn2, vn3); } None } /// Subtract `b` from `a` and return the difference. `a` should be greater than /// or equal to `b`. fn sub(a: *const u8, b: *const u8) -> usize { debug_assert!(a >= b); (a as usize) - (b as usize) } memchr-2.2.1/.cargo_vcs_info.json0000644000000001120000000000000123420ustar00{ "git": { "sha1": "75155594dd6c130d47c944bbf31813f454d94341" } }