triple_accel-0.4.0/.cargo_vcs_info.json0000644000000001120000000000000134540ustar { "git": { "sha1": "e921c1450a78b4d75bce1e81964e9a84bc4ffefa" } } triple_accel-0.4.0/.github/workflows/build_triple_accel.yml000064400000000000000000000041760000000000000221650ustar 00000000000000name: Test on: push: branches: [master] pull_request: branches: [master] jobs: build: runs-on: ${{matrix.os}} strategy: matrix: os: [ubuntu-latest, windows-latest, macos-latest] steps: - uses: actions/checkout@v2 - name: Run tests AVX2/8-bit run: cargo test --verbose --no-default-features --features "debug jewel-avx jewel-8bit" -- --nocapture - name: Run bench tests AVX2/8-bit run: cargo test --bench rand_benchmarks --verbose --no-default-features --features "debug jewel-avx jewel-8bit" -- --nocapture - name: Run tests AVX2/16-bit run: cargo test --verbose --no-default-features --features "debug jewel-avx jewel-16bit" -- --nocapture - name: Run bench tests AVX2/16-bit run: cargo test --bench rand_benchmarks --verbose --no-default-features --features "debug jewel-avx jewel-16bit" -- --nocapture - name: Run tests AVX2/32-bit run: cargo test --verbose --no-default-features --features "debug jewel-avx jewel-32bit" -- --nocapture - name: Run bench tests AVX2/32-bit run: cargo test --bench rand_benchmarks --verbose --no-default-features --features "debug jewel-avx jewel-32bit" -- --nocapture - name: Run tests SSE4.1/8-bit run: cargo test --verbose --no-default-features --features "debug jewel-sse jewel-8bit" -- --nocapture - name: Run bench tests SSE4.1/8-bit run: cargo test --bench rand_benchmarks --verbose --no-default-features --features "debug jewel-sse jewel-8bit" -- --nocapture - name: Run tests SSE4.1/16-bit run: cargo test --verbose --no-default-features --features "debug jewel-sse jewel-16bit" -- --nocapture - name: Run bench tests SSE4.1/16-bit run: cargo test --bench rand_benchmarks --verbose --no-default-features --features "debug jewel-sse jewel-16bit" -- --nocapture - name: Run tests SSE4.1/32-bit run: cargo test --verbose --no-default-features --features "debug jewel-sse jewel-32bit" -- --nocapture - name: Run bench tests SSE4.1/32-bit run: cargo test --bench rand_benchmarks --verbose --no-default-features --features "debug jewel-sse jewel-32bit" -- --nocapture triple_accel-0.4.0/.gitignore000064400000000000000000000000230000000000000142130ustar 00000000000000/target Cargo.lock triple_accel-0.4.0/CODE_OF_CONDUCT.md000064400000000000000000000064330000000000000150350ustar 00000000000000# Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at danielliu@liudaniel.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq triple_accel-0.4.0/CONTRIBUTING.md000064400000000000000000000014510000000000000144620ustar 00000000000000# Contributing Contributions are welcome! Here are some basic guidelines: * Open an issue for questions or bug reports. Be detailed about the platform and CPU features so the bug can be reproduced. * Make sure all tests pass for pull requests. Note that both the SIMD and scalar variants of the code should return the same result. Remember to add tests for new features! * Performance regressions due to code changes should be reported in the pull requests. New benchmarks should be added, if necessary. Performance is very important in this library! * Use a similar code style as the current code for pull requests. 
* It may be helpful to inspect the LLVM-IR or assembly output by using `build_ir_asm.sh`. * It may be helpful to use the debug feature flag through `--features "debug"` to get debug output. triple_accel-0.4.0/Cargo.toml0000644000000030220000000000000114550ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] edition = "2018" name = "triple_accel" version = "0.4.0" authors = ["c0deb0t "] description = "Rust edit distance routines accelerated using SIMD. Supports fast Hamming, Levenshtein, restricted Damerau-Levenshtein, etc. distance calculations and string search." homepage = "https://github.com/Daniel-Liu-c0deb0t/triple_accel" readme = "README.md" keywords = ["SIMD", "levenshtein", "hamming", "string-search", "string-distance"] categories = ["algorithms", "hardware-support", "science", "text-processing"] license = "MIT" repository = "https://github.com/Daniel-Liu-c0deb0t/triple_accel" [profile.release] opt-level = 3 codegen-units = 1 panic = "abort" [[bench]] name = "rand_benchmarks" harness = false [dependencies] [dev-dependencies.criterion] version = "0.3" [dev-dependencies.rand] version = "0.7.3" [features] debug = [] default = ["jewel-avx", "jewel-sse", "jewel-8bit", "jewel-16bit", "jewel-32bit"] jewel-16bit = [] jewel-32bit = [] jewel-8bit = [] jewel-avx = [] jewel-sse = [] triple_accel-0.4.0/Cargo.toml.orig000064400000000000000000000020520000000000000151160ustar 00000000000000[package] name = "triple_accel" version = "0.4.0" authors = ["c0deb0t "] edition = "2018" license = "MIT" description = "Rust edit distance routines accelerated using SIMD. Supports fast Hamming, Levenshtein, restricted Damerau-Levenshtein, etc. distance calculations and string search." 
homepage = "https://github.com/Daniel-Liu-c0deb0t/triple_accel" repository = "https://github.com/Daniel-Liu-c0deb0t/triple_accel" readme = "README.md" keywords = ["SIMD", "levenshtein", "hamming", "string-search", "string-distance"] categories = ["algorithms", "hardware-support", "science", "text-processing"] [features] # automatic selection of which type of Jewel vector to use by default default = ["jewel-avx", "jewel-sse", "jewel-8bit", "jewel-16bit", "jewel-32bit"] jewel-avx = [] jewel-sse = [] jewel-8bit = [] jewel-16bit = [] jewel-32bit = [] debug = [] [profile.release] opt-level = 3 codegen-units = 1 panic = "abort" [[bench]] name = "rand_benchmarks" harness = false [dependencies] [dev-dependencies] criterion = "0.3" rand = "0.7.3" triple_accel-0.4.0/LICENSE000064400000000000000000000020530000000000000132350ustar 00000000000000MIT License Copyright (c) 2020 Daniel Liu Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. triple_accel-0.4.0/README.md000064400000000000000000000151270000000000000135150ustar 00000000000000# triple_accel ![Test](https://github.com/Daniel-Liu-c0deb0t/triple_accel/workflows/Test/badge.svg) ![GitHub](https://img.shields.io/github/license/Daniel-Liu-c0deb0t/triple_accel) ![Crates.io](https://img.shields.io/crates/v/triple_accel) ![Docs.rs](https://docs.rs/triple_accel/badge.svg) Rust edit distance routines accelerated using SIMD. Supports fast Hamming, Levenshtein, restricted Damerau-Levenshtein, etc. distance calculations and string search. Although vectorized SIMD code allows for up to 20-30x speedups over their scalar counterparts, the difficulty of handling platform-dependent SIMD code makes SIMD routines less attractive. The goal of this library is to provide an easy-to-use abstraction over SIMD edit distance routines that fall back to scalar routines if the target CPU architecture is not supported. Additionally, all limitations and tradeoffs of the edit distance routines should be provided upfront so the user knows exactly what to expect. Finally, this library should lead to performance boosts on both short and longer strings, so it can be used for a variety of tasks, from bioinformatics to natural language processing. `triple_accel` is very lightweight: it only has dependencies on other crates for benchmarking. It can be built on machines without CPUs that have AVX2 or SSE4.1 support. It can also run on machines without SIMD support by automatically using scalar alternatives. ## Install Add ``` triple_accel = "*" ``` to the `[dependencies]` section of your `Cargo.toml`. 
This library is available [here](https://crates.io/crates/triple_accel) on crates.io. Alternatively, you can clone this repository and run ``` cargo build --release ``` In general, for maximum efficiency, use `RUSTFLAGS="-C target-cpu=native"` if portability is not an issue. ## Tests You can run tests with ``` cargo test ``` after cloning the repository. Continuous integration is used to ensure that the code passes all tests on the latest Linux, Windows, and Mac platforms. Additionally, crate feature flags like `jewel-sse`, `jewel-avx`, `jewel-8bit`, `jewel-16bit`, and `jewel-32bit` are used to override the default automatic detection of CPU features, so all features can be thoroughly tested in continuous integration. The `debug` feature flag is specified, so the exact underlying vector type that is used is printed. ## Benchmarks Benchmarks can be ran with ``` cargo bench ``` ## Docs The docs are available [here](https://docs.rs/triple_accel). To build them on your machine, run ``` cargo doc ``` ## Features This library provides routines for both searching for some needle string in a haystack string and calculating the edit distance between two strings. Hamming distance (mismatches only), Levenshtein distance (mismatches + gaps), and restricted Damerau-Levenshtein distance (transpositions + mismatches + gaps) are supported, along with arbitrary edit costs. This library provides a simple interface, in addition to powerful lower-level control over the edit distance calculations. At runtime, the implementation for a certain algorithm is selected based on CPU support, going down the list: 1. Vectorized implementation with 256-bit AVX vectors, if AVX2 is supported. 2. Vectorized implementation with 128-bit SSE vectors, if SSE4.1 is supported. 3. Scalar implementation. Currently, vectorized SIMD implementations are only available for x86 or x86-64 CPUs. However, after compiling this library on a machine that supports those SIMD intrinsics, the library can be used on other machines. Additionally, the internal data structure for storing vectors and the bit width of the values in the vectors are selected at runtime for maximum efficiency and accuracy, given the lengths of the input strings. ## Limitations Due to the use of SIMD intrinsics, only binary strings that are represented with `u8` bytes are supported. Unicode strings are not currently supported. ## Examples `triple_accel` provides a very simple and easy to use framework for common edit distance operations. Calculating the Hamming distance (number of mismatches) between two strings is extremely simple: ```Rust use triple_accel::*; let a = b"abcd"; let b = b"abcc"; let dist = hamming(a, b); assert!(dist == 1); ``` By default, SIMD will be used if possible. Similarly, we can easily calculate the Levenshtein distance (character mismatches and gaps all have a cost of 1) between two strings with the following code: ```Rust use triple_accel::*; let a = b"abc"; let b = b"abcd"; let dist = levenshtein_exp(a, b); assert!(dist == 1); ``` This uses exponential search to estimate the number of edits between `a` and `b`, which makes it more efficient than the alternative `levenshtein` function when the number of edits between `a` and `b` is low. In addition to edit distance routines, `triple_accel` also provides search routines. These routines return an iterator over matches that indicate where the `needle` string matches the `haystack` string. 
`triple_accel` will attempt to maximize the length of matches that end at the same position and remove shorter matches when some matches fully overlap. ```Rust use triple_accel::*; let needle = b"helllo"; let haystack = b"hello world"; let matches: Vec = levenshtein_search(needle, haystack).collect(); // note: start index is inclusive, end index is exclusive! assert!(matches == vec![Match{start: 0, end: 5, k: 1}]); ``` Sometimes, it is necessary to use the slightly lower level, but also more powerful routines that `triple_accel` provides. For example, it is possible to allow transpositions (character swaps) that have a cost of 1, in addition to mismatches and gaps: ```Rust use triple_accel::levenshtein::*; let a = b"abcd"; let b = b"abdc"; let k = 2; // upper bound on allowed cost let trace_on = false; // return edit traceback? let dist = levenshtein_simd_k_with_opts(a, b, k, trace_on, RDAMERAU_COSTS); // note: dist may be None if a and b do not match within a cost of k assert!(dist.unwrap().0 == 1); ``` Don't let the name of the function fool you! `levenshtein_simd_k_with_opts` will still fall back to the scalar implementation if AVX2 or SSE4.1 support is not available. It just prefers to use SIMD where possible. For most common cases, the re-exported functions are enough, and the low level functions do not have to be used directly. ## License [MIT](LICENSE) ## Contributing Read the contributing guidelines [here](CONTRIBUTING.md). ## Code of Conduct Read the code of conduct [here](CODE_OF_CONDUCT.md). ## Why the name "triple_accel"? Because "Time Altar - Triple Accel" is a magical ability used by Kiritsugu Emiya to boost his speed and reaction time in Fate/Zero. There are also some other references to the Fate series... triple_accel-0.4.0/benches/rand_benchmarks.rs000064400000000000000000000243470000000000000173400ustar 00000000000000use criterion::*; use rand::prelude::*; use triple_accel::*; use triple_accel::levenshtein::*; use triple_accel::hamming::*; fn bench_rand_hamming(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(1234); let mut group = c.benchmark_group("bench_rand_hamming"); let config = PlotConfiguration::default().summary_scale(AxisScale::Logarithmic); group.plot_config(config); for str_len in [10, 100, 1000].iter() { let k = black_box(((*str_len) as u32) / 10); let (a_str, b_str) = black_box(rand_hamming_pair(*str_len, k, &mut rng)); let res = hamming_naive(&a_str, &b_str); assert!(res == hamming_words_64(&a_str, &b_str)); assert!(res == hamming_words_128(&a_str, &b_str)); assert!(res == hamming_simd_movemask(&a_str, &b_str)); assert!(res == hamming_simd_parallel(&a_str, &b_str)); group.bench_function(BenchmarkId::new("hamming_naive", *str_len), |b| b.iter(|| hamming_naive(&a_str, &b_str))); group.bench_function(BenchmarkId::new("hamming_words_64", *str_len), |b| b.iter(|| hamming_words_64(&a_str, &b_str))); group.bench_function(BenchmarkId::new("hamming_words_128", *str_len), |b| b.iter(|| hamming_words_128(&a_str, &b_str))); group.bench_function(BenchmarkId::new("hamming_simd_movemask", *str_len), |b| b.iter(|| hamming_simd_movemask(&a_str, &b_str))); group.bench_function(BenchmarkId::new("hamming_simd_parallel", *str_len), |b| b.iter(|| hamming_simd_parallel(&a_str, &b_str))); } group.finish(); } fn bench_rand_hamming_search(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(1234); let mut group = c.benchmark_group("bench_rand_hamming_search"); let config = PlotConfiguration::default().summary_scale(AxisScale::Logarithmic); group.plot_config(config); for 
str_len in [100, 1000].iter() { let needle_len = black_box(*str_len / 10); let num_needles = black_box(*str_len / 20); let k = black_box(((*str_len) as u32) / 100); let (needle, haystack) = black_box(rand_hamming_needle_haystack(needle_len, *str_len, num_needles, k, &mut rng)); let res: Vec = hamming_search_naive_with_opts(&needle, &haystack, k, SearchType::All).collect(); assert!(res == hamming_search_simd_with_opts(&needle, &haystack, k, SearchType::All).collect::>()); group.bench_function(BenchmarkId::new("hamming_search_naive_k", *str_len), |b| b.iter(|| hamming_search_naive_with_opts(&needle, &haystack, k, SearchType::All).last())); group.bench_function(BenchmarkId::new("hamming_search_simd_k", *str_len), |b| b.iter(|| hamming_search_simd_with_opts(&needle, &haystack, k, SearchType::All).last())); } group.finish(); } fn bench_rand_levenshtein(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(1234); let mut group = c.benchmark_group("bench_rand_levenshtein"); let config = PlotConfiguration::default().summary_scale(AxisScale::Logarithmic); group.plot_config(config); for str_len in [10, 100, 1000].iter() { let k = black_box(((*str_len) as u32) / 10); let (a_str, b_str) = black_box(rand_levenshtein_pair(*str_len, k, &mut rng)); let res = levenshtein_naive(&a_str, &b_str); assert!(res == levenshtein_exp(&a_str, &b_str)); assert!(res == levenshtein(&a_str, &b_str)); group.bench_function(BenchmarkId::new("levenshtein_naive", *str_len), |b| b.iter(|| levenshtein_naive(&a_str, &b_str))); group.bench_function(BenchmarkId::new("levenshtein_exp", *str_len), |b| b.iter(|| levenshtein_exp(&a_str, &b_str))); group.bench_function(BenchmarkId::new("levenshtein", *str_len), |b| b.iter(|| levenshtein(&a_str, &b_str))); } group.finish(); } fn bench_rand_levenshtein_k(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(1234); let mut group = c.benchmark_group("bench_rand_levenshtein_k"); let config = PlotConfiguration::default().summary_scale(AxisScale::Logarithmic); group.plot_config(config); for str_len in [10, 100, 1000].iter() { let k = black_box(((*str_len) as u32) / 10); let trace_on = black_box(false); let (a_str, b_str) = black_box(rand_levenshtein_pair(*str_len, k, &mut rng)); let res = levenshtein_naive_with_opts(&a_str, &b_str, trace_on, LEVENSHTEIN_COSTS); assert!(res == levenshtein_naive_k_with_opts(&a_str, &b_str, k, trace_on, LEVENSHTEIN_COSTS).unwrap()); assert!(res == levenshtein_simd_k_with_opts(&a_str, &b_str, k, trace_on, LEVENSHTEIN_COSTS).unwrap()); group.bench_function(BenchmarkId::new("levenshtein_naive", *str_len), |b| b.iter(|| levenshtein_naive_with_opts(&a_str, &b_str, trace_on, LEVENSHTEIN_COSTS))); group.bench_function(BenchmarkId::new("levenshtein_naive_k", *str_len), |b| b.iter(|| levenshtein_naive_k_with_opts(&a_str, &b_str, k, trace_on, LEVENSHTEIN_COSTS))); group.bench_function(BenchmarkId::new("levenshtein_simd_k", *str_len), |b| b.iter(|| levenshtein_simd_k_with_opts(&a_str, &b_str, k, trace_on, LEVENSHTEIN_COSTS))); } group.finish(); } fn bench_rand_levenshtein_search(c: &mut Criterion) { let mut rng = StdRng::seed_from_u64(1234); let mut group = c.benchmark_group("bench_rand_levenshtein_search"); let config = PlotConfiguration::default().summary_scale(AxisScale::Logarithmic); group.plot_config(config); for str_len in [100, 1000].iter() { let needle_len = black_box(*str_len / 10); let num_needles = black_box(*str_len / 20); let k = black_box(((*str_len) as u32) / 100); let anchored = black_box(false); let (needle, haystack) = 
black_box(rand_levenshtein_needle_haystack(needle_len, *str_len, num_needles, k, &mut rng)); let res: Vec = levenshtein_search_naive_with_opts(&needle, &haystack, k, SearchType::All, LEVENSHTEIN_COSTS, anchored).collect(); assert!(res == levenshtein_search_simd_with_opts(&needle, &haystack, k, SearchType::All, LEVENSHTEIN_COSTS, anchored).collect::>()); group.bench_function(BenchmarkId::new("levenshtein_search_naive_k", *str_len), |b| b.iter(|| levenshtein_search_naive_with_opts(&needle, &haystack, k, SearchType::All, LEVENSHTEIN_COSTS, anchored).last())); group.bench_function(BenchmarkId::new("levenshtein_search_simd_k", *str_len), |b| b.iter(|| levenshtein_search_simd_with_opts(&needle, &haystack, k, SearchType::All, LEVENSHTEIN_COSTS, anchored).last())); } group.finish(); } criterion_group!(bench_rand, bench_rand_hamming, bench_rand_hamming_search, bench_rand_levenshtein, bench_rand_levenshtein_k, bench_rand_levenshtein_search); criterion_main!(bench_rand); fn rand_hamming_needle_haystack(needle_len: usize, haystack_len: usize, num_match: usize, k: u32, rng: &mut R) -> (Vec, Vec) { let mut idx: Vec = (0usize..haystack_len).collect(); idx.shuffle(rng); let mut insert = vec![false; haystack_len]; for i in 0..num_match { insert[idx[i]] = true; } let bytes: Vec = (33u8..127u8).collect(); let needle = rand_alloc_str(needle_len, rng); let mut haystack: Vec = vec![]; for i in 0..haystack_len { if insert[i] { let s = rand_hamming_mutate(&needle, k, rng); haystack.extend(&s[..needle_len]); }else{ haystack.push(*bytes.choose(rng).unwrap()); } } let mut haystack_final = alloc_str(haystack.len()); fill_str(&mut haystack_final, &haystack); (needle, haystack_final) } fn rand_hamming_pair(length: usize, k: u32, rng: &mut R) -> (Vec, Vec) { let a = rand_alloc_str(length, rng); let b = rand_hamming_mutate(&a, k, rng); (a, b) } fn rand_hamming_mutate(a: &[u8], k: u32, rng: &mut R) -> Vec { let mut b = alloc_str(a.len()); fill_str(&mut b, a); let curr_k: usize = rng.gen_range((k / 2) as usize, k as usize + 1); let mut idx: Vec = (0usize..a.len()).collect(); idx.shuffle(rng); for i in 0..curr_k { b[idx[i]] = 32u8; } b } fn rand_levenshtein_needle_haystack(needle_len: usize, haystack_len: usize, num_match: usize, k: u32, rng: &mut R) -> (Vec, Vec) { let mut idx: Vec = (0usize..haystack_len).collect(); idx.shuffle(rng); let mut insert = vec![false; haystack_len]; for i in 0..num_match { insert[idx[i]] = true; } let bytes: Vec = (33u8..127u8).collect(); let needle = rand_str(needle_len, rng); let mut haystack: Vec = vec![]; for i in 0..haystack_len { if insert[i] { let s = rand_levenshtein_mutate(&needle, k, rng); haystack.extend(&s); }else{ haystack.push(*bytes.choose(rng).unwrap()); } } (needle, haystack) } fn rand_levenshtein_pair(length: usize, k: u32, rng: &mut R) -> (Vec, Vec) { let a = rand_str(length, rng); let b = rand_levenshtein_mutate(&a, k, rng); (a, b) } fn rand_levenshtein_mutate(a: &[u8], k: u32, rng: &mut R) -> Vec { let mut edits = vec![0u8; a.len()]; let curr_k: usize = rng.gen_range((k / 2) as usize, k as usize + 1); let mut idx: Vec = (0usize..a.len()).collect(); idx.shuffle(rng); for i in 0..curr_k { edits[idx[i]] = rng.gen_range(1u8, 4u8); } let bytes: Vec = (33u8..127u8).collect(); let mut b = vec![]; for i in 0..a.len() { match edits[i] { 0u8 => { // same b.push(a[i]); }, 1u8 => { // diff b.push(32u8); }, 2u8 => { // insert b.push(*bytes.choose(rng).unwrap()); b.push(a[i]); }, 3u8 => (), // delete _ => panic!("This should not have been reached!") } } b } fn rand_str(length: usize, 
rng: &mut R) -> Vec { let bytes: Vec = (33u8..127u8).collect(); let mut res = vec![0u8; length]; for i in 0..length { res[i] = *bytes.choose(rng).unwrap(); } res } fn rand_alloc_str(length: usize, rng: &mut R) -> Vec { let bytes: Vec = (33u8..127u8).collect(); let mut res = alloc_str(length); for i in 0..length { res[i] = *bytes.choose(rng).unwrap(); } res } triple_accel-0.4.0/build_ir_asm.sh000075500000000000000000000000650000000000000152210ustar 00000000000000RUSTFLAGS="--emit llvm-ir,asm" cargo build --release triple_accel-0.4.0/src/hamming.rs000064400000000000000000000502460000000000000150140ustar 00000000000000//! This module provides many Hamming distance routines. //! //! These distance functions share the same efficient underlying SIMD-accelerated implementation: //! * `hamming` //! * `hamming_simd_parallel` //! //! These search functions share the same efficient underlying SIMD-accelerated implementation: //! * `hamming_search` //! * `hamming_search_simd` //! * `hamming_search_simd_with_opts` use std::*; use super::*; use super::jewel::*; /// Returns the hamming distance between two strings by naively counting mismatches. /// /// The length of `a` and `b` must be the same. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Panics /// * If the length of `a` does not equal the length of `b`. /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::hamming::*; /// let dist = hamming_naive(b"abc", b"abd"); /// /// assert!(dist == 1); /// ``` pub fn hamming_naive(a: &[u8], b: &[u8]) -> u32 { let len = a.len(); assert!(len == b.len()); let mut res = 0u32; for i in 0..len { res += (a[i] != b[i]) as u32; } res } /// Returns an iterator over best `Match`s by naively searching through the text `haystack` /// for the pattern `needle`. /// /// This is done by naively counting mismatches at every position in `haystack`. /// Only the matches with the lowest Hamming distance are returned. /// Each returned `Match` requires at least half or more bytes of the `needle` to match /// somewhere in the `haystack`. /// The length of `needle` must be less than or equal to the length of `haystack`. /// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::hamming::*; /// let matches: Vec = hamming_search_naive(b"abc", b" abd").collect(); /// /// assert!(matches == vec![Match{start: 2, end: 5, k: 1}]); /// ``` pub fn hamming_search_naive<'a>(needle: &'a [u8], haystack: &'a [u8]) -> Box + 'a> { hamming_search_naive_with_opts(needle, haystack, ((needle.len() as u32) >> 1) + ((needle.len() as u32) & 1), SearchType::Best) } /// Returns an iterator over `Match`s by naively searching through the text `haystack` /// for the pattern `needle`, with extra options. /// /// Only matches with less than `k` mismatches are returned. /// This is done by naively counting mismatches at every position in `haystack`. /// The length of `needle` must be less than or equal to the length of `haystack`. 
/// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// * `k` - number of mismatches allowed /// * `search_type` - whether to only return the "best" matches with the lowest Hamming distance, or /// all matches /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::hamming::*; /// let matches: Vec = hamming_search_naive_with_opts(b"abc", b" abd", 1, SearchType::All).collect(); /// /// assert!(matches == vec![Match{start: 2, end: 5, k: 1}]); /// ``` pub fn hamming_search_naive_with_opts<'a>(needle: &'a [u8], haystack: &'a [u8], k: u32, search_type: SearchType) -> Box + 'a> { let needle_len = needle.len(); let haystack_len = haystack.len(); if needle_len > haystack_len { return Box::new(iter::empty()); } let len = haystack_len + 1 - needle_len; let mut curr_k = k; let mut i = 0; let res = iter::from_fn(move || { 'outer: while i < len { let mut final_res = 0u32; for j in 0..needle_len { final_res += (needle[j] != haystack[i + j]) as u32; // early stop if final_res > curr_k { i += 1; continue 'outer; } } match search_type { SearchType::Best => curr_k = final_res, _ => () } i += 1; return Some((Match{start: i - 1, end: i + needle_len - 1, k: final_res}, curr_k)); } None }); if search_type == SearchType::Best { let mut res_vec = Vec::with_capacity(haystack_len / needle_len); res.for_each(|m| { res_vec.push(m.0); curr_k = m.1; }); return Box::new(res_vec.into_iter().filter(move |m| m.k == curr_k)); } Box::new(res.map(|m| m.0)) } /// Returns the hamming distance between two strings by efficiently counting mismatches in chunks of 64 bits. /// /// The length of `a` and `b` must be the same. /// Both `a` and `b` must be aligned and padded so they can be directly casted to chunks of `u64`. /// Use `alloc_str` to create aligned and padded strings. /// This should be faster than `hamming_naive` and maybe even `hamming_words_128`. This should be slower /// than `hamming_simd_parallel/movemask`. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Panics /// * If the length of `a` does not equal the length of `b`. 
/// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::hamming::*; /// let mut a = alloc_str(3); /// let mut b = alloc_str(3); /// fill_str(&mut a, b"abc"); /// fill_str(&mut b, b"abd"); /// /// let dist = hamming_words_64(&a, &b); /// /// assert!(dist == 1); /// ``` pub fn hamming_words_64(a: &[u8], b: &[u8]) -> u32 { assert!(a.len() == b.len()); unsafe { let mut res = 0u32; // the pointer address better be aligned for u64 // may not be in little endian let a_ptr = a.as_ptr() as *const u64; let b_ptr = b.as_ptr() as *const u64; let words_len = (a.len() >> 3) as isize; for i in 0..words_len { // change to little endian omitted because it is not necessary in this case let mut r = (*a_ptr.offset(i)) ^ (*b_ptr.offset(i)); // reduce or by "folding" one half of each byte onto the other multiple times r |= r >> 4; // ...00001111 r &= 0x0f0f0f0f0f0f0f0fu64; r |= r >> 2; // ...00110011 r &= 0x3333333333333333u64; r |= r >> 1; // ...01010101 r &= 0x5555555555555555u64; res += r.count_ones(); } let words_rem = a.len() & 7; if words_rem > 0 { let mut r = (*a_ptr.offset(words_len)) ^ (*b_ptr.offset(words_len)); r |= r >> 4; r &= 0x0f0f0f0f0f0f0f0fu64; r |= r >> 2; r &= 0x3333333333333333u64; r |= r >> 1; r &= 0x5555555555555555u64; // make sure to mask out bits outside the string lengths res += (r & ((1u64 << ((words_rem as u64) << 3u64)) - 1u64)).count_ones(); } res } } /// Returns the hamming distance between two strings by counting mismatches in chunks of 128 bits. /// /// The length of `a` and `b` must be the same. /// Both `a` and `b` must be aligned and padded so they can be directly casted to chunks of `u128`. /// Use `alloc_str` to create aligned and padded strings. /// This may be slower than `hamming_words_64` in practice, probably since Rust `u128` is not as /// optimized. This should be slower than `hamming_simd_parallel/movemask`. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Panics /// * If the length of `a` does not equal the length of `b`. 
/// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::hamming::*; /// let mut a = alloc_str(3); /// let mut b = alloc_str(3); /// fill_str(&mut a, b"abc"); /// fill_str(&mut b, b"abd"); /// /// let dist = hamming_words_128(&a, &b); /// /// assert!(dist == 1); /// ``` pub fn hamming_words_128(a: &[u8], b: &[u8]) -> u32 { assert!(a.len() == b.len()); unsafe { let mut res = 0u32; // the pointer address better be aligned for u128 // may not be in little endian let a_ptr = a.as_ptr() as *const u128; let b_ptr = b.as_ptr() as *const u128; let words_len = (a.len() >> 4) as isize; for i in 0..words_len { // change to little endian omitted because it is not necessary in this case let mut r = (*a_ptr.offset(i)) ^ (*b_ptr.offset(i)); // reduce or by "folding" one half of each byte onto the other multiple times r |= r >> 4; // ...00001111 r &= 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0fu128; r |= r >> 2; // ...00110011 r &= 0x33333333333333333333333333333333u128; r |= r >> 1; // ...01010101 r &= 0x55555555555555555555555555555555u128; res += r.count_ones(); } let words_rem = a.len() & 15; if words_rem > 0 { let mut r = (*a_ptr.offset(words_len)) ^ (*b_ptr.offset(words_len)); r |= r >> 4; r &= 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0fu128; r |= r >> 2; r &= 0x33333333333333333333333333333333u128; r |= r >> 1; r &= 0x55555555555555555555555555555555u128; // make sure to mask out bits outside the string lengths res += (r & ((1u128 << ((words_rem as u128) << 3u128)) - 1u128)).count_ones(); } res } } /// Returns the hamming distance between two strings by counting mismatches using SIMD vectors to /// increment multiple counters in parallel. /// /// The length of `a` and `b` must be the same. /// There are no constraints on how `a` and `b` are aligned and padded. /// This will automatically fall back to `hamming_naive`, if AVX2 and SSE4.1 are not supported. /// This should be faster than both `hamming_word_64/128` and `hamming_simd_movemask`. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Panics /// * If the length of `a` does not equal the length of `b`. /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::hamming::*; /// let dist = hamming_simd_parallel(b"abc", b"abd"); /// /// assert!(dist == 1); /// ``` pub fn hamming_simd_parallel(a: &[u8], b: &[u8]) -> u32 { assert!(a.len() == b.len()); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { if cfg!(feature = "jewel-avx") && is_x86_feature_detected!("avx2") { return unsafe {Avx::count_mismatches(a.as_ptr(), b.as_ptr(), a.len())}; }else if cfg!(feature = "jewel-sse") && is_x86_feature_detected!("sse4.1") { return unsafe {Sse::count_mismatches(a.as_ptr(), b.as_ptr(), a.len())}; } } hamming_naive(a, b) } /// Returns the hamming distance between two strings by counting mismatches using the SIMD movemask intrinsic. /// /// The length of `a` and `b` must be the same. /// There are no constraints on how `a` and `b` are aligned and padded. /// This will automatically fall back to `hamming_naive`, if AVX2 and SSE4.1 are not supported. /// This should be faster than `hamming_word_64/128`, but slower than `hamming_simd_parallel`. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Panics /// * If the length of `a` does not equal the length of `b`. 
/// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::hamming::*; /// let dist = hamming_simd_movemask(b"abc", b"abd"); /// /// assert!(dist == 1); /// ``` pub fn hamming_simd_movemask(a: &[u8], b: &[u8]) -> u32 { assert!(a.len() == b.len()); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { if cfg!(feature = "jewel-avx") && is_x86_feature_detected!("avx2") { return unsafe {Avx::mm_count_mismatches(a.as_ptr(), b.as_ptr(), a.len())}; }else if cfg!(feature = "jewel-sse") && is_x86_feature_detected!("sse4.1") { return unsafe {Sse::mm_count_mismatches(a.as_ptr(), b.as_ptr(), a.len())}; } } hamming_naive(a, b) } /// Returns the hamming distance between two strings using the best method. /// /// The length of `a` and `b` must be the same. /// This will automatically fall back to a scalar alternative if AVX2 and /// SSE4.1 are not supported. /// Internally, this calls `hamming_simd_parallel`. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Panics /// * If the length of `a` does not equal the length of `b`. /// /// # Example /// ``` /// # use triple_accel::*; /// let dist = hamming(b"abc", b"abd"); /// /// assert!(dist == 1); /// ``` pub fn hamming(a: &[u8], b: &[u8]) -> u32 { hamming_simd_parallel(a, b) } /// Returns an iterator over best `Match`s by searching through the text `haystack` /// for the pattern `needle` using SIMD. /// /// This is done by counting mismatches at every position in `haystack`. /// This will automatically fall back to `hamming_search_naive_with_opts` if AVX2 and SSE4.1 /// are not supported. /// Null bytes/characters are not supported. /// The length of `needle` must be less than or equal to the length of `haystack`. /// Each returned `Match` requires at least half or more bytes of the `needle` to match /// somwhere in the `haystack`. /// Only the matches with the lowest Hamming distance are returned. /// This should be faster than `hamming_search_naive`. /// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// /// # Panics /// * When there are zero/null bytes in the `haystack` string. /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::hamming::*; /// let matches: Vec = hamming_search_simd(b"abc", b" abd").collect(); /// /// assert!(matches == vec![Match{start: 2, end: 5, k: 1}]); /// ``` pub fn hamming_search_simd<'a>(needle: &'a [u8], haystack: &'a [u8]) -> Box + 'a> { hamming_search_simd_with_opts(needle, haystack, ((needle.len() as u32) >> 1) + ((needle.len() as u32) & 1), SearchType::Best) } /// Returns an iterator over `Match`s by searching through the text `haystack` for the /// pattern `needle` using SIMD, with extra options. /// /// This is done by using SIMD to count mismatches at every position in `haystack`. /// This will automatically fall back to `hamming_search_naive_with_opts` if AVX2 and SSE4.1 /// are not supported. /// Null bytes/characters are not supported. /// The length of `needle` must be less than or equal to the length of `haystack`. /// This should be faster than `hamming_search_naive_with_opts`. /// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// * `k` - number of mismatches allowed /// * `search_type` - whether to only return the "best" matches with the lowest Hamming distance, or /// all matches /// /// # Panics /// * When there are zero/null bytes in the `haystack` string. 
/// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::hamming::*; /// let matches: Vec = hamming_search_simd_with_opts(b"abc", b" abd", 1, SearchType::All).collect(); /// /// assert!(matches == vec![Match{start: 2, end: 5, k: 1}]); /// ``` pub fn hamming_search_simd_with_opts<'a>(needle: &'a [u8], haystack: &'a [u8], k: u32, search_type: SearchType) -> Box + 'a> { if needle.len() > haystack.len() { return Box::new(iter::empty()); } if needle.len() == 0 { return Box::new(iter::empty()); } check_no_null_bytes(haystack); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { if cfg!(feature = "jewel-avx") && is_x86_feature_detected!("avx2") { return unsafe {hamming_search_simd_core_avx(needle, haystack, k, search_type)}; }else if cfg!(feature = "jewel-sse") && is_x86_feature_detected!("sse4.1") { return unsafe {hamming_search_simd_core_sse(needle, haystack, k, search_type)}; } } hamming_search_naive_with_opts(needle, haystack, k, search_type) } macro_rules! create_hamming_search_simd_core { ($name:ident, $jewel:ty, $target:literal) => { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = $target)] unsafe fn $name<'a>(needle: &'a [u8], haystack: &'a [u8], k: u32, search_type: SearchType) -> Box + 'a> { #[cfg(feature = "debug")] { println!("Debug: Hamming search Jewel vector type {} for target {}.", stringify!($jewel), stringify!($target)); } let needle_len = needle.len(); let haystack_len = haystack.len(); let needle_vector = <$jewel>::loadu(needle.as_ptr(), needle_len); // calculate len using the unused bytes in the needle Jewel vector, for speed // there may be leftover positions in haystack that need to be calculated using a // scalar search afterwards // there should be no null bytes in the strings let len = if needle_vector.upper_bound() > haystack_len {0} else {haystack_len + 1 - needle_vector.upper_bound()}; let real_len = haystack_len + 1 - needle_len; let haystack_ptr = haystack.as_ptr(); let mut curr_k = k; let mut i = 0; let res = iter::from_fn(move || { while i < len { let final_res = <$jewel>::vector_count_mismatches(&needle_vector, haystack_ptr.offset(i as isize), needle_len); i += 1; if final_res <= curr_k { match search_type { SearchType::Best => curr_k = final_res, _ => () } return Some((Match{start: i - 1, end: i + needle_len - 1, k: final_res}, curr_k)); } } // scalar search 'outer: while i < real_len { let mut final_res = 0u32; for j in 0..needle_len { final_res += (*needle.get_unchecked(j) != *haystack.get_unchecked(i + j)) as u32; if final_res > curr_k { i += 1; continue 'outer; } } match search_type { SearchType::Best => curr_k = final_res, _ => () } i += 1; return Some((Match{start: i - 1, end: i + needle_len - 1, k: final_res}, curr_k)); } None }); if search_type == SearchType::Best { let mut res_vec = Vec::with_capacity(haystack_len / needle_len); res.for_each(|m| { res_vec.push(m.0); curr_k = m.1; }); return Box::new(res_vec.into_iter().filter(move |m| m.k == curr_k)); } Box::new(res.map(|m| m.0)) } }; } // generate different versions for different intrinsics #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_hamming_search_simd_core!(hamming_search_simd_core_avx, Avx, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_hamming_search_simd_core!(hamming_search_simd_core_sse, Sse, "sse4.1"); /// Returns an iterator over best `Match`s by searching through the text `haystack` /// for the pattern `needle` using SIMD. 
/// /// This will automatically fall back to a scalar alternative if AVX2 and SSE4.1 /// are not supported. /// Null bytes/characters are not supported. /// The length of `needle` must be less than or equal to the length of `haystack`. /// Each returned `Match` requires at least half or more bytes of the `needle` to match /// somewhere in the `haystack`. /// Only the matches with the lowest Hamming distance are returned. /// Internally, this calls `hamming_search_simd`. /// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// /// # Panics /// * When there are zero/null bytes in the `haystack` string. /// /// # Example /// ``` /// # use triple_accel::*; /// let matches: Vec = hamming_search(b"abc", b" abd").collect(); /// /// assert!(matches == vec![Match{start: 2, end: 5, k: 1}]); /// ``` pub fn hamming_search<'a>(needle: &'a [u8], haystack: &'a [u8]) -> Box + 'a> { hamming_search_simd(needle, haystack) } triple_accel-0.4.0/src/jewel.rs000064400000000000000000003155220000000000000145030ustar 00000000000000//! This module provides wrappers for SIMD intrinsics, so they can be used on multiple platforms. use std::*; #[cfg(target_arch = "x86")] use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; /// Jewel provides a uniform interface for SIMD operations. /// /// To save space, most operations are modify in place. /// Jewel vectors can be easily printed for debugging purposes. /// Additionally, the functions should be inlined to the caller for /// maximum efficiency. pub trait Jewel: fmt::Display { /// Functions for allocating memory and creating a new Jewel vector. unsafe fn repeating(val: u32, len: usize) -> Self; unsafe fn repeating_max(len: usize) -> Self; /// Figure out the length of the created vector, which may /// be longer than the length given by the caller. fn upper_bound(&self) -> usize; /// Figure out the length if it is static. fn static_upper_bound() -> usize; /// These operations do not have to be very efficient. unsafe fn slow_loadu(&mut self, idx: usize, ptr: *const u8, len: usize, reverse: bool); unsafe fn slow_extract(&self, i: usize) -> u32; unsafe fn slow_insert(&mut self, i: usize, val: u32); /// These operations modify the Jewel struct where the function is called. /// last_0 is the last element, last_1 is the second to last, etc. unsafe fn insert_last_0(&mut self, val: u32); unsafe fn insert_last_1(&mut self, val: u32); unsafe fn insert_last_2(&mut self, val: u32); unsafe fn insert_last_max(&mut self); unsafe fn insert_first(&mut self, val: u32); unsafe fn insert_first_max(&mut self); unsafe fn add_mut(&mut self, b: &Self); unsafe fn adds_mut(&mut self, b: &Self); unsafe fn and_mut(&mut self, b: &Self); unsafe fn andnot_mut(&mut self, b: &Self); unsafe fn cmpeq_mut(&mut self, b: &Self); unsafe fn min_mut(&mut self, b: &Self); unsafe fn max_mut(&mut self, b: &Self); unsafe fn blendv_mut(&mut self, b: &Self, mask: &Self); unsafe fn shift_left_1_mut(&mut self); unsafe fn shift_left_2_mut(&mut self); unsafe fn shift_right_1_mut(&mut self); /// These operations overwrite a res vector to reduce memory allocations. 
unsafe fn add(a: &Self, b: &Self, res: &mut Self); unsafe fn adds(a: &Self, b: &Self, res: &mut Self); unsafe fn andnot(a: &Self, b: &Self, res: &mut Self); unsafe fn cmpeq(a: &Self, b: &Self, res: &mut Self); unsafe fn min(a: &Self, b: &Self, res: &mut Self); unsafe fn max(a: &Self, b: &Self, res: &mut Self); unsafe fn shift_left_1(a: &Self, res: &mut Self); unsafe fn shift_right_1(a: &Self, res: &mut Self); /// `triple_argmin` will allocate memory and create a new Jewel vector. unsafe fn triple_argmin(sub: &Self, a_gap: &Self, b_gap: &Self, res_min: &mut Self) -> Self; unsafe fn triple_min_length(sub: &Self, a_gap: &Self, b_gap: &Self, sub_length: &Self, a_gap_length: &Self, b_gap_length: &Self, res_min: &mut Self, res_length: &mut Self); unsafe fn double_min_length(new_gap: &Self, res_cont_gap: &mut Self, new_gap_length: &Self, res_cont_gap_length: &mut Self); } // macros to help generate implementations for some of the Jewel vector functions macro_rules! operation_param2 { ($target:literal, $fn_name:ident, $intrinsic:ident) => { #[target_feature(enable = $target)] #[inline] unsafe fn $fn_name(a: &Self, b: &Self, res: &mut Self) { for i in 0..a.v.len() { *res.v.get_unchecked_mut(i) = $intrinsic(*a.v.get_unchecked(i), *b.v.get_unchecked(i)); } } }; } macro_rules! operation_mut_param2 { ($target:literal, $fn_name:ident, $intrinsic:ident) => { #[target_feature(enable = $target)] #[inline] unsafe fn $fn_name(&mut self, b: &Self) { for i in 0..self.v.len() { *self.v.get_unchecked_mut(i) = $intrinsic(*self.v.get_unchecked(i), *b.v.get_unchecked(i)); } } }; } /// N x 32 x 8 vector backed with 256-bit AVX vectors. macro_rules! create_avx_nx32x8 { ($name:ident, $num:literal) => { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub struct $name { v: [__m256i; $num] } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl Jewel for $name { #[target_feature(enable = "avx2")] #[inline] unsafe fn repeating(val: u32, _len: usize) -> Self { let v = [_mm256_set1_epi8(val as i8); $num]; Self{ v: v } } #[target_feature(enable = "avx2")] #[inline] unsafe fn repeating_max(_len: usize) -> Self { let v = [_mm256_set1_epi8(-1i8); $num]; Self{ v: v } } #[inline] fn upper_bound(&self) -> usize { self.v.len() << 5 } #[inline] fn static_upper_bound() -> usize { $num << 5 } #[target_feature(enable = "avx2")] #[inline] unsafe fn slow_loadu(&mut self, idx: usize, ptr: *const u8, len: usize, reverse: bool) { if len == 0 { return; } let mut arr = [0u8; 32]; let arr_ptr = arr.as_mut_ptr() as *mut __m256i; let store_idx = if reverse {31} else {0}; let load_idx = if reverse {0} else {31}; for i in 0..len { let curr_idx = if reverse {idx - i} else {idx + i}; let arr_idx = curr_idx & 31; if arr_idx == store_idx || i == 0 { // use part of the original array _mm256_storeu_si256(arr_ptr, *self.v.get_unchecked(curr_idx >> 5)); } *arr.get_unchecked_mut(arr_idx) = *ptr.offset(i as isize); if arr_idx == load_idx || i == len - 1 { *self.v.get_unchecked_mut(curr_idx >> 5) = _mm256_loadu_si256(arr_ptr); } } } #[target_feature(enable = "avx2")] #[inline] unsafe fn slow_extract(&self, i: usize) -> u32 { let idx = i >> 5; let j = i & 31; let mut arr = [0u8; 32]; _mm256_storeu_si256(arr.as_mut_ptr() as *mut __m256i, *self.v.get_unchecked(idx)); *arr.get_unchecked(j) as u32 } #[target_feature(enable = "avx2")] #[inline] unsafe fn slow_insert(&mut self, i: usize, val: u32) { let idx = i >> 5; let j = i & 31; let mut arr = [0u8; 32]; let arr_ptr = arr.as_mut_ptr() as *mut __m256i; _mm256_storeu_si256(arr_ptr, 
*self.v.get_unchecked(idx)); *arr.get_unchecked_mut(j) = val as u8; *self.v.get_unchecked_mut(idx) = _mm256_loadu_si256(arr_ptr); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_0(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi8(*self.v.get_unchecked(last), val as i8, 31i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_1(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi8(*self.v.get_unchecked(last), val as i8, 30i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_2(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi8(*self.v.get_unchecked(last), val as i8, 29i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_max(&mut self) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi8(*self.v.get_unchecked(last), -1i8, 31i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_first(&mut self, val: u32) { *self.v.get_unchecked_mut(0) = _mm256_insert_epi8(*self.v.get_unchecked(0), val as i8, 0i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_first_max(&mut self) { *self.v.get_unchecked_mut(0) = _mm256_insert_epi8(*self.v.get_unchecked(0), -1i8, 0i32); } operation_mut_param2!("avx2", add_mut, _mm256_add_epi8); operation_mut_param2!("avx2", adds_mut, _mm256_adds_epu8); operation_mut_param2!("avx2", and_mut, _mm256_and_si256); operation_mut_param2!("avx2", andnot_mut, _mm256_andnot_si256); operation_mut_param2!("avx2", cmpeq_mut, _mm256_cmpeq_epi8); operation_mut_param2!("avx2", min_mut, _mm256_min_epu8); operation_mut_param2!("avx2", max_mut, _mm256_max_epu8); #[target_feature(enable = "avx2")] #[inline] unsafe fn blendv_mut(&mut self, b: &Self, mask: &Self) { for i in 0..self.v.len() { *self.v.get_unchecked_mut(i) = _mm256_blendv_epi8(*self.v.get_unchecked(i), *b.v.get_unchecked(i), *mask.v.get_unchecked(i)); } } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left_1_mut(&mut self) { for i in 0..(self.v.len() - 1) { let curr = *self.v.get_unchecked(i); // permute concatenates the second half of the current vector and the first half of the next vector *self.v.get_unchecked_mut(i) = _mm256_alignr_epi8( _mm256_permute2x128_si256(curr, *self.v.get_unchecked(i + 1), 0b00100001i32), curr, 1i32); } // last one gets to shift in zeros let last = self.v.len() - 1; let curr = *self.v.get_unchecked(last); // permute concatenates the second half of the last vector and a vector of zeros *self.v.get_unchecked_mut(last) = _mm256_alignr_epi8(_mm256_permute2x128_si256(curr, curr, 0b10000001i32), curr, 1i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left_2_mut(&mut self) { for i in 0..(self.v.len() - 1) { let curr = *self.v.get_unchecked(i); // permute concatenates the second half of the current vector and the first half of the next vector *self.v.get_unchecked_mut(i) = _mm256_alignr_epi8( _mm256_permute2x128_si256(curr, *self.v.get_unchecked(i + 1), 0b00100001i32), curr, 2i32); } // last one gets to shift in zeros let last = self.v.len() - 1; let curr = *self.v.get_unchecked(last); // permute concatenates the second half of the last vector and a vector of zeros *self.v.get_unchecked_mut(last) = _mm256_alignr_epi8(_mm256_permute2x128_si256(curr, curr, 0b10000001i32), curr, 2i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_right_1_mut(&mut self) { for i 
in (1..self.v.len()).rev() { let curr = *self.v.get_unchecked(i); // permute concatenates the second half of the previous vector and the first half of the current vector *self.v.get_unchecked_mut(i) = _mm256_alignr_epi8( curr, _mm256_permute2x128_si256(curr, *self.v.get_unchecked(i - 1), 0b00000011i32), 15i32); } // first one gets to shift in zeros let curr = *self.v.get_unchecked(0); // permute concatenates a vector of zeros and the first half of the first vector *self.v.get_unchecked_mut(0) = _mm256_alignr_epi8(curr, _mm256_permute2x128_si256(curr, curr, 0b00001000i32), 15i32); } operation_param2!("avx2", add, _mm256_add_epi8); operation_param2!("avx2", adds, _mm256_adds_epu8); operation_param2!("avx2", andnot, _mm256_andnot_si256); operation_param2!("avx2", cmpeq, _mm256_cmpeq_epi8); operation_param2!("avx2", min, _mm256_min_epu8); operation_param2!("avx2", max, _mm256_max_epu8); #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left_1(a: &Self, res: &mut Self) { for i in 0..(a.v.len() - 1) { let curr = *a.v.get_unchecked(i); // permute concatenates the second half of the current vector and the first half of the next vector *res.v.get_unchecked_mut(i) = _mm256_alignr_epi8( _mm256_permute2x128_si256(curr, *a.v.get_unchecked(i + 1), 0b00100001i32), curr, 1i32); } // last one gets to shift in zeros let last = a.v.len() - 1; let curr = *a.v.get_unchecked(last); // permute concatenates the second half of the last vector and a vector of zeros *res.v.get_unchecked_mut(last) = _mm256_alignr_epi8(_mm256_permute2x128_si256(curr, curr, 0b10000001i32), curr, 1i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_right_1(a: &Self, res: &mut Self) { for i in (1..a.v.len()).rev() { let curr = *a.v.get_unchecked(i); // permute concatenates the second half of the previous vector and the first half of the current vector *res.v.get_unchecked_mut(i) = _mm256_alignr_epi8( curr, _mm256_permute2x128_si256(curr, *a.v.get_unchecked(i - 1), 0b00000011i32), 15i32); } // first one gets to shift in zeros let curr = *a.v.get_unchecked(0); // permute concatenates a vector of zeros and the first half of the first vector *res.v.get_unchecked_mut(0) = _mm256_alignr_epi8(curr, _mm256_permute2x128_si256(curr, curr, 0b00001000i32), 15i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn triple_argmin(sub: &Self, a_gap: &Self, b_gap: &Self, res_min: &mut Self) -> Self { // return the edit used in addition to doing a min operation let mut v = [_mm256_undefined_si256(); $num]; let twos = _mm256_set1_epi8(2); for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let res_min1 = _mm256_min_epu8(a_gap, b_gap); // a gap: 2 + -1 = 1, b gap: 2 + 0 = 2 let res_arg1 = _mm256_add_epi8(twos, _mm256_cmpeq_epi8(a_gap, res_min1)); let res_min2 = _mm256_min_epu8(sub, res_min1); // sub: 0 let res_arg2 = _mm256_andnot_si256(_mm256_cmpeq_epi8(sub, res_min2), res_arg1); *res_min.v.get_unchecked_mut(i) = res_min2; *v.get_unchecked_mut(i) = res_arg2; } Self{ v: v } } #[target_feature(enable = "avx2")] #[inline] unsafe fn triple_min_length(sub: &Self, a_gap: &Self, b_gap: &Self, sub_length: &Self, a_gap_length: &Self, b_gap_length: &Self, res_min: &mut Self, res_length: &mut Self) { // choose the length based on which edit is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = 
*b_gap.v.get_unchecked(i); let sub_length = *sub_length.v.get_unchecked(i); let a_gap_length = *a_gap_length.v.get_unchecked(i); let b_gap_length = *b_gap_length.v.get_unchecked(i); let res_min1 = _mm256_min_epu8(a_gap, b_gap); let a_b_gt_mask = _mm256_cmpeq_epi8(a_gap, res_min1); // a gap: -1, b gap: 0 let mut res_length1 = _mm256_blendv_epi8(b_gap_length, a_gap_length, a_b_gt_mask); // lengths based on edits let a_b_eq_mask = _mm256_cmpeq_epi8(a_gap, b_gap); // equal: -1 let a_b_max_len = _mm256_max_epu8(a_gap_length, b_gap_length); res_length1 = _mm256_blendv_epi8(res_length1, a_b_max_len, a_b_eq_mask); // maximize length if edits equal let res_min2 = _mm256_min_epu8(sub, res_min1); let sub_gt_mask = _mm256_cmpeq_epi8(sub, res_min2); // sub: -1, prev a or b gap: 0 let mut res_length2 = _mm256_blendv_epi8(res_length1, sub_length, sub_gt_mask); // length based on edits let sub_eq_mask = _mm256_cmpeq_epi8(sub, res_min1); let sub_max_len = _mm256_max_epu8(sub_length, res_length1); res_length2 = _mm256_blendv_epi8(res_length2, sub_max_len, sub_eq_mask); // maximize length if edits equal *res_min.v.get_unchecked_mut(i) = res_min2; *res_length.v.get_unchecked_mut(i) = res_length2; } } #[target_feature(enable = "avx2")] #[inline] unsafe fn double_min_length(new_gap: &Self, res_cont_gap: &mut Self, new_gap_length: &Self, res_cont_gap_length: &mut Self) { // choose the length based on which gap type is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..new_gap.v.len() { let new_gap = *new_gap.v.get_unchecked(i); let cont_gap = *res_cont_gap.v.get_unchecked(i); let new_gap_length = *new_gap_length.v.get_unchecked(i); let cont_gap_length = *res_cont_gap_length.v.get_unchecked(i); let res_min = _mm256_min_epu8(new_gap, cont_gap); let new_cont_gt_mask = _mm256_cmpeq_epi8(new_gap, res_min); // new gap: -1, continue gap: 0 let mut res_length = _mm256_blendv_epi8(cont_gap_length, new_gap_length, new_cont_gt_mask); // lengths based on edits let new_cont_eq_mask = _mm256_cmpeq_epi8(new_gap, cont_gap); // equal: -1 let new_cont_max_len = _mm256_max_epu8(new_gap_length, cont_gap_length); res_length = _mm256_blendv_epi8(res_length, new_cont_max_len, new_cont_eq_mask); // maximize length if edits equal *res_cont_gap.v.get_unchecked_mut(i) = res_min; *res_cont_gap_length.v.get_unchecked_mut(i) = res_length; } } } // this implementation will probably only be used for debugging #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl fmt::Display for $name { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] #[inline] unsafe fn fmt_internal(s: &$name, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "[")?; let mut arr = [0u8; 32]; let arr_ptr = arr.as_mut_ptr() as *mut __m256i; for i in 0..(s.v.len() - 1) { _mm256_storeu_si256(arr_ptr, *s.v.get_unchecked(i)); for j in 0..32 { write!(f, "{:>3}, ", *arr.get_unchecked(j))?; } } // leftover elements _mm256_storeu_si256(arr_ptr, *s.v.get_unchecked(s.v.len() - 1)); let start = (s.v.len() - 1) << 5; for i in 0..(s.upper_bound() - start) { if i == s.upper_bound() - start - 1 { write!(f, "{:>3}", *arr.get_unchecked(i))?; }else{ write!(f, "{:>3}, ", *arr.get_unchecked(i))?; } } write!(f, "]") } unsafe { fmt_internal(self, f) } } } }; } // constant array size, so the compiler should unroll the loops create_avx_nx32x8!(Avx1x32x8, 1); create_avx_nx32x8!(Avx2x32x8, 2); create_avx_nx32x8!(Avx4x32x8, 4); 
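// Editor's note (hedged, illustrative): AVX2 has no full 256-bit byte shift, so the shift_left_* and
// shift_right_* methods generated above combine _mm256_permute2x128_si256 (to build a vector holding the
// neighboring 128-bit lane) with _mm256_alignr_epi8 (which concatenates and byte-shifts within each pair of
// 128-bit lanes). A minimal single-vector sketch of the same trick follows; it assumes the arch intrinsics
// imported at the top of this module are in scope and is not part of the crate's public API.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
#[allow(dead_code)]
unsafe fn example_shift_left_1_byte(a: __m256i) -> __m256i {
    // imm8 = 0b10000001: result low lane = a's high lane, result high lane = zeros
    let hi_then_zero = _mm256_permute2x128_si256(a, a, 0b10000001i32);
    // per lane: concatenate (hi_then_zero lane : a lane) and shift right by one byte, so the low lane
    // pulls its missing top byte from a's high lane and the high lane shifts in a zero
    _mm256_alignr_epi8(hi_then_zero, a, 1i32)
}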
create_avx_nx32x8!(Avx8x32x8, 8); /// N x 16 x 16 vector backed with 256-bit AVX vectors. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub struct AvxNx16x16 { v: Vec<__m256i> } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl Jewel for AvxNx16x16 { #[target_feature(enable = "avx2")] #[inline] unsafe fn repeating(val: u32, len: usize) -> Self { let v = vec![_mm256_set1_epi16(val as i16); (len >> 4) + if (len & 15) > 0 {1} else {0}]; Self{ v: v } } #[target_feature(enable = "avx2")] #[inline] unsafe fn repeating_max(len: usize) -> Self { let v = vec![_mm256_set1_epi16(-1i16); (len >> 4) + if (len & 15) > 0 {1} else {0}]; Self{ v: v } } #[inline] fn upper_bound(&self) -> usize { self.v.len() << 4 } #[inline] fn static_upper_bound() -> usize { unimplemented!() } #[target_feature(enable = "avx2")] #[inline] unsafe fn slow_loadu(&mut self, idx: usize, ptr: *const u8, len: usize, reverse: bool) { if len == 0 { return; } let mut arr = [0u16; 16]; let arr_ptr = arr.as_mut_ptr() as *mut __m256i; let store_idx = if reverse {15} else {0}; let load_idx = if reverse {0} else {15}; for i in 0..len { let curr_idx = if reverse {idx - i} else {idx + i}; let arr_idx = curr_idx & 15; if arr_idx == store_idx || i == 0 { _mm256_storeu_si256(arr_ptr, *self.v.get_unchecked(curr_idx >> 4)); } *arr.get_unchecked_mut(arr_idx) = *ptr.offset(i as isize) as u16; if arr_idx == load_idx || i == len - 1 { *self.v.get_unchecked_mut(curr_idx >> 4) = _mm256_loadu_si256(arr_ptr); } } } #[target_feature(enable = "avx2")] #[inline] unsafe fn slow_extract(&self, i: usize) -> u32 { let idx = i >> 4; let j = i & 15; let mut arr = [0u16; 16]; _mm256_storeu_si256(arr.as_mut_ptr() as *mut __m256i, *self.v.get_unchecked(idx)); *arr.get_unchecked(j) as u32 } #[target_feature(enable = "avx2")] #[inline] unsafe fn slow_insert(&mut self, i: usize, val: u32) { let idx = i >> 4; let j = i & 15; let mut arr = [0u16; 16]; let arr_ptr = arr.as_mut_ptr() as *mut __m256i; _mm256_storeu_si256(arr_ptr, *self.v.get_unchecked(idx)); *arr.get_unchecked_mut(j) = val as u16; *self.v.get_unchecked_mut(idx) = _mm256_loadu_si256(arr_ptr); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_0(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi16(*self.v.get_unchecked(last), val as i16, 15i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_1(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi16(*self.v.get_unchecked(last), val as i16, 14i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_2(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi16(*self.v.get_unchecked(last), val as i16, 13i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_max(&mut self) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi16(*self.v.get_unchecked(last), -1i16, 15i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_first(&mut self, val: u32) { *self.v.get_unchecked_mut(0) = _mm256_insert_epi16(*self.v.get_unchecked(0), val as i16, 0i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_first_max(&mut self) { *self.v.get_unchecked_mut(0) = _mm256_insert_epi16(*self.v.get_unchecked(0), -1i16, 0i32); } operation_mut_param2!("avx2", add_mut, _mm256_add_epi16); operation_mut_param2!("avx2", adds_mut, _mm256_adds_epu16); operation_mut_param2!("avx2", and_mut, 
_mm256_and_si256); operation_mut_param2!("avx2", andnot_mut, _mm256_andnot_si256); operation_mut_param2!("avx2", cmpeq_mut, _mm256_cmpeq_epi16); operation_mut_param2!("avx2", min_mut, _mm256_min_epu16); operation_mut_param2!("avx2", max_mut, _mm256_max_epu16); #[target_feature(enable = "avx2")] #[inline] unsafe fn blendv_mut(&mut self, b: &Self, mask: &Self) { for i in 0..self.v.len() { *self.v.get_unchecked_mut(i) = _mm256_blendv_epi8(*self.v.get_unchecked(i), *b.v.get_unchecked(i), *mask.v.get_unchecked(i)); } } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left_1_mut(&mut self) { for i in 0..(self.v.len() - 1) { let curr = *self.v.get_unchecked(i); // permute concatenates the second half of the current vector and the first half of the next vector *self.v.get_unchecked_mut(i) = _mm256_alignr_epi8( _mm256_permute2x128_si256(curr, *self.v.get_unchecked(i + 1), 0b00100001i32), curr, 2i32); } // last one gets to shift in zeros let last = self.v.len() - 1; let curr = *self.v.get_unchecked(last); // permute concatenates the second half of the last vector and a vector of zeros *self.v.get_unchecked_mut(last) = _mm256_alignr_epi8(_mm256_permute2x128_si256(curr, curr, 0b10000001i32), curr, 2i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left_2_mut(&mut self) { for i in 0..(self.v.len() - 1) { let curr = *self.v.get_unchecked(i); // permute concatenates the second half of the current vector and the first half of the next vector *self.v.get_unchecked_mut(i) = _mm256_alignr_epi8( _mm256_permute2x128_si256(curr, *self.v.get_unchecked(i + 1), 0b00100001i32), curr, 4i32); } // last one gets to shift in zeros let last = self.v.len() - 1; let curr = *self.v.get_unchecked(last); // permute concatenates the second half of the last vector and a vector of zeros *self.v.get_unchecked_mut(last) = _mm256_alignr_epi8(_mm256_permute2x128_si256(curr, curr, 0b10000001i32), curr, 4i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_right_1_mut(&mut self) { for i in (1..self.v.len()).rev() { let curr = *self.v.get_unchecked(i); // permute concatenates the second half of the previous vector and the first half of the current vector *self.v.get_unchecked_mut(i) = _mm256_alignr_epi8( curr, _mm256_permute2x128_si256(curr, *self.v.get_unchecked(i - 1), 0b00000011i32), 14i32); } // first one gets to shift in zeros let curr = *self.v.get_unchecked(0); // permute concatenates a vector of zeros and the first half of the first vector *self.v.get_unchecked_mut(0) = _mm256_alignr_epi8(curr, _mm256_permute2x128_si256(curr, curr, 0b00001000i32), 14i32); } operation_param2!("avx2", add, _mm256_add_epi16); operation_param2!("avx2", adds, _mm256_adds_epu16); operation_param2!("avx2", andnot, _mm256_andnot_si256); operation_param2!("avx2", cmpeq, _mm256_cmpeq_epi16); operation_param2!("avx2", min, _mm256_min_epu16); operation_param2!("avx2", max, _mm256_max_epu16); #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left_1(a: &Self, res: &mut Self) { for i in 0..(a.v.len() - 1) { let curr = *a.v.get_unchecked(i); // permute concatenates the second half of the current vector and the first half of the next vector *res.v.get_unchecked_mut(i) = _mm256_alignr_epi8( _mm256_permute2x128_si256(curr, *a.v.get_unchecked(i + 1), 0b00100001i32), curr, 2i32); } // last one gets to shift in zeros let last = a.v.len() - 1; let curr = *a.v.get_unchecked(last); // permute concatenates the second half of the last vector and a vector of zeros *res.v.get_unchecked_mut(last) = 
_mm256_alignr_epi8(_mm256_permute2x128_si256(curr, curr, 0b10000001i32), curr, 2i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_right_1(a: &Self, res: &mut Self) { for i in (1..a.v.len()).rev() { let curr = *a.v.get_unchecked(i); // permute concatenates the second half of the previous vector and the first half of the current vector *res.v.get_unchecked_mut(i) = _mm256_alignr_epi8( curr, _mm256_permute2x128_si256(curr, *a.v.get_unchecked(i - 1), 0b00000011i32), 14i32); } // first one gets to shift in zeros let curr = *a.v.get_unchecked(0); // permute concatenates a vector of zeros and the first half of the first vector *res.v.get_unchecked_mut(0) = _mm256_alignr_epi8(curr, _mm256_permute2x128_si256(curr, curr, 0b00001000i32), 14i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn triple_argmin(sub: &Self, a_gap: &Self, b_gap: &Self, res_min: &mut Self) -> Self { // return the edit used in addition to doing a min operation let mut v = Vec::with_capacity(sub.v.len()); let twos = _mm256_set1_epi16(2); for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let res_min1 = _mm256_min_epu16(a_gap, b_gap); // a gap: 2 + -1 = 1, b gap: 2 + 0 = 2 let res_arg1 = _mm256_add_epi16(twos, _mm256_cmpeq_epi16(a_gap, res_min1)); let res_min2 = _mm256_min_epu16(sub, res_min1); // sub: 0 let res_arg2 = _mm256_andnot_si256(_mm256_cmpeq_epi16(sub, res_min2), res_arg1); *res_min.v.get_unchecked_mut(i) = res_min2; v.push(res_arg2); } Self{ v: v } } #[target_feature(enable = "avx2")] #[inline] unsafe fn triple_min_length(sub: &Self, a_gap: &Self, b_gap: &Self, sub_length: &Self, a_gap_length: &Self, b_gap_length: &Self, res_min: &mut Self, res_length: &mut Self) { // choose the length based on which edit is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let sub_length = *sub_length.v.get_unchecked(i); let a_gap_length = *a_gap_length.v.get_unchecked(i); let b_gap_length = *b_gap_length.v.get_unchecked(i); let res_min1 = _mm256_min_epu16(a_gap, b_gap); let a_b_gt_mask = _mm256_cmpeq_epi16(a_gap, res_min1); // a gap: -1, b gap: 0 let mut res_length1 = _mm256_blendv_epi8(b_gap_length, a_gap_length, a_b_gt_mask); // lengths based on edits let a_b_eq_mask = _mm256_cmpeq_epi16(a_gap, b_gap); // equal: -1 let a_b_max_len = _mm256_max_epu16(a_gap_length, b_gap_length); res_length1 = _mm256_blendv_epi8(res_length1, a_b_max_len, a_b_eq_mask); // maximize length if edits equal let res_min2 = _mm256_min_epu16(sub, res_min1); let sub_gt_mask = _mm256_cmpeq_epi16(sub, res_min2); // sub: -1, prev a or b gap: 0 let mut res_length2 = _mm256_blendv_epi8(res_length1, sub_length, sub_gt_mask); // length based on edits let sub_eq_mask = _mm256_cmpeq_epi16(sub, res_min1); let sub_max_len = _mm256_max_epu16(sub_length, res_length1); res_length2 = _mm256_blendv_epi8(res_length2, sub_max_len, sub_eq_mask); // maximize length if edits equal *res_min.v.get_unchecked_mut(i) = res_min2; *res_length.v.get_unchecked_mut(i) = res_length2; } } #[target_feature(enable = "avx2")] #[inline] unsafe fn double_min_length(new_gap: &Self, res_cont_gap: &mut Self, new_gap_length: &Self, res_cont_gap_length: &mut Self) { // choose the length based on which gap type is chosen during the min operation // secondary objective of maximizing length if edit costs 
equal for i in 0..new_gap.v.len() { let new_gap = *new_gap.v.get_unchecked(i); let cont_gap = *res_cont_gap.v.get_unchecked(i); let new_gap_length = *new_gap_length.v.get_unchecked(i); let cont_gap_length = *res_cont_gap_length.v.get_unchecked(i); let res_min = _mm256_min_epu16(new_gap, cont_gap); let new_cont_gt_mask = _mm256_cmpeq_epi16(new_gap, res_min); // new gap: -1, continue gap: 0 let mut res_length = _mm256_blendv_epi8(cont_gap_length, new_gap_length, new_cont_gt_mask); // lengths based on edits let new_cont_eq_mask = _mm256_cmpeq_epi16(new_gap, cont_gap); // equal: -1 let new_cont_max_len = _mm256_max_epu16(new_gap_length, cont_gap_length); res_length = _mm256_blendv_epi8(res_length, new_cont_max_len, new_cont_eq_mask); // maximize length if edits equal *res_cont_gap.v.get_unchecked_mut(i) = res_min; *res_cont_gap_length.v.get_unchecked_mut(i) = res_length; } } } // this implementation will probably only be used for debugging #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl fmt::Display for AvxNx16x16 { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] #[inline] unsafe fn fmt_internal(s: &AvxNx16x16, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "[")?; let mut arr = [0u16; 16]; let arr_ptr = arr.as_mut_ptr() as *mut __m256i; for i in 0..(s.v.len() - 1) { _mm256_storeu_si256(arr_ptr, *s.v.get_unchecked(i)); for j in 0..16 { write!(f, "{:>3}, ", *arr.get_unchecked(j))?; } } // leftover elements _mm256_storeu_si256(arr_ptr, *s.v.get_unchecked(s.v.len() - 1)); let start = (s.v.len() - 1) << 4; for i in 0..(s.upper_bound() - start) { if i == s.upper_bound() - start - 1 { write!(f, "{:>3}", *arr.get_unchecked(i))?; }else{ write!(f, "{:>3}, ", *arr.get_unchecked(i))?; } } write!(f, "]") } unsafe { fmt_internal(self, f) } } } /// N x 8 x 32 vector backed with 256-bit AVX vectors. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub struct AvxNx8x32 { v: Vec<__m256i> } /// Workaround for the lack of the _mm256_adds_epu32 intrinsic. 
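// Editor's note (hedged): AVX2 only provides saturating adds for 8- and 16-bit lanes, so the function below
// emulates an unsigned saturating add for 32-bit lanes. Per lane the idea is roughly the scalar logic
//     let sum = a.wrapping_add(b);
//     if sum < a { u32::MAX } else { sum }
// where min/cmpeq build the "did not overflow" mask and blendv selects either sum or all-ones.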
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] #[inline] unsafe fn _mm256_adds_epu32(a: __m256i, b: __m256i) -> __m256i { let sum = _mm256_add_epi32(a, b); let min = _mm256_min_epu32(a, sum); let eq = _mm256_cmpeq_epi32(a, min); // if the sum is less than a, then saturate // note: sum is either greater than both a and b or less than both _mm256_blendv_epi8(_mm256_set1_epi32(-1i32), sum, eq) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl Jewel for AvxNx8x32 { #[target_feature(enable = "avx2")] #[inline] unsafe fn repeating(val: u32, len: usize) -> Self { let v = vec![_mm256_set1_epi32(val as i32); (len >> 3) + if (len & 7) > 0 {1} else {0}]; Self{ v: v } } #[target_feature(enable = "avx2")] #[inline] unsafe fn repeating_max(len: usize) -> Self { let v = vec![_mm256_set1_epi32(-1i32); (len >> 3) + if (len & 7) > 0 {1} else {0}]; Self{ v: v } } #[inline] fn upper_bound(&self) -> usize { self.v.len() << 3 } #[inline] fn static_upper_bound() -> usize { unimplemented!() } #[target_feature(enable = "avx2")] #[inline] unsafe fn slow_loadu(&mut self, idx: usize, ptr: *const u8, len: usize, reverse: bool) { if len == 0 { return; } let mut arr = [0u32; 8]; let arr_ptr = arr.as_mut_ptr() as *mut __m256i; let store_idx = if reverse {7} else {0}; let load_idx = if reverse {0} else {7}; for i in 0..len { let curr_idx = if reverse {idx - i} else {idx + i}; let arr_idx = curr_idx & 7; if arr_idx == store_idx || i == 0 { _mm256_storeu_si256(arr_ptr, *self.v.get_unchecked(curr_idx >> 3)); } *arr.get_unchecked_mut(arr_idx) = *ptr.offset(i as isize) as u32; if arr_idx == load_idx || i == len - 1 { *self.v.get_unchecked_mut(curr_idx >> 3) = _mm256_loadu_si256(arr_ptr); } } } #[target_feature(enable = "avx2")] #[inline] unsafe fn slow_extract(&self, i: usize) -> u32 { let idx = i >> 3; let j = i & 7; let mut arr = [0u32; 8]; _mm256_storeu_si256(arr.as_mut_ptr() as *mut __m256i, *self.v.get_unchecked(idx)); *arr.get_unchecked(j) } #[target_feature(enable = "avx2")] #[inline] unsafe fn slow_insert(&mut self, i: usize, val: u32) { let idx = i >> 3; let j = i & 7; let mut arr = [0u32; 8]; let arr_ptr = arr.as_mut_ptr() as *mut __m256i; _mm256_storeu_si256(arr_ptr, *self.v.get_unchecked(idx)); *arr.get_unchecked_mut(j) = val; *self.v.get_unchecked_mut(idx) = _mm256_loadu_si256(arr_ptr); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_0(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi32(*self.v.get_unchecked(last), val as i32, 7i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_1(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi32(*self.v.get_unchecked(last), val as i32, 6i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_2(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi32(*self.v.get_unchecked(last), val as i32, 5i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_last_max(&mut self) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm256_insert_epi32(*self.v.get_unchecked(last), -1i32, 7i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_first(&mut self, val: u32) { *self.v.get_unchecked_mut(0) = _mm256_insert_epi32(*self.v.get_unchecked(0), val as i32, 0i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn insert_first_max(&mut self) { *self.v.get_unchecked_mut(0) 
= _mm256_insert_epi32(*self.v.get_unchecked(0), -1i32, 0i32); } operation_mut_param2!("avx2", add_mut, _mm256_add_epi32); operation_mut_param2!("avx2", adds_mut, _mm256_adds_epu32); operation_mut_param2!("avx2", and_mut, _mm256_and_si256); operation_mut_param2!("avx2", andnot_mut, _mm256_andnot_si256); operation_mut_param2!("avx2", cmpeq_mut, _mm256_cmpeq_epi32); operation_mut_param2!("avx2", min_mut, _mm256_min_epu32); operation_mut_param2!("avx2", max_mut, _mm256_max_epu32); #[target_feature(enable = "avx2")] #[inline] unsafe fn blendv_mut(&mut self, b: &Self, mask: &Self) { for i in 0..self.v.len() { *self.v.get_unchecked_mut(i) = _mm256_blendv_epi8(*self.v.get_unchecked(i), *b.v.get_unchecked(i), *mask.v.get_unchecked(i)); } } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left_1_mut(&mut self) { for i in 0..(self.v.len() - 1) { let curr = *self.v.get_unchecked(i); // permute concatenates the second half of the current vector and the first half of the next vector *self.v.get_unchecked_mut(i) = _mm256_alignr_epi8( _mm256_permute2x128_si256(curr, *self.v.get_unchecked(i + 1), 0b00100001i32), curr, 4i32); } // last one gets to shift in zeros let last = self.v.len() - 1; let curr = *self.v.get_unchecked(last); // permute concatenates the second half of the last vector and a vector of zeros *self.v.get_unchecked_mut(last) = _mm256_alignr_epi8(_mm256_permute2x128_si256(curr, curr, 0b10000001i32), curr, 4i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left_2_mut(&mut self) { for i in 0..(self.v.len() - 1) { let curr = *self.v.get_unchecked(i); // permute concatenates the second half of the current vector and the first half of the next vector *self.v.get_unchecked_mut(i) = _mm256_alignr_epi8( _mm256_permute2x128_si256(curr, *self.v.get_unchecked(i + 1), 0b00100001i32), curr, 8i32); } // last one gets to shift in zeros let last = self.v.len() - 1; let curr = *self.v.get_unchecked(last); // permute concatenates the second half of the last vector and a vector of zeros *self.v.get_unchecked_mut(last) = _mm256_alignr_epi8(_mm256_permute2x128_si256(curr, curr, 0b10000001i32), curr, 8i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_right_1_mut(&mut self) { for i in (1..self.v.len()).rev() { let curr = *self.v.get_unchecked(i); // permute concatenates the second half of the previous vector and the first half of the current vector *self.v.get_unchecked_mut(i) = _mm256_alignr_epi8( curr, _mm256_permute2x128_si256(curr, *self.v.get_unchecked(i - 1), 0b00000011i32), 12i32); } // first one gets to shift in zeros let curr = *self.v.get_unchecked(0); // permute concatenates a vector of zeros and the first half of the first vector *self.v.get_unchecked_mut(0) = _mm256_alignr_epi8(curr, _mm256_permute2x128_si256(curr, curr, 0b00001000i32), 12i32); } operation_param2!("avx2", add, _mm256_add_epi32); operation_param2!("avx2", adds, _mm256_adds_epu32); operation_param2!("avx2", andnot, _mm256_andnot_si256); operation_param2!("avx2", cmpeq, _mm256_cmpeq_epi32); operation_param2!("avx2", min, _mm256_min_epu32); operation_param2!("avx2", max, _mm256_max_epu32); #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_left_1(a: &Self, res: &mut Self) { for i in 0..(a.v.len() - 1) { let curr = *a.v.get_unchecked(i); // permute concatenates the second half of the current vector and the first half of the next vector *res.v.get_unchecked_mut(i) = _mm256_alignr_epi8( _mm256_permute2x128_si256(curr, *a.v.get_unchecked(i + 1), 0b00100001i32), curr, 4i32); } // last 
one gets to shift in zeros let last = a.v.len() - 1; let curr = *a.v.get_unchecked(last); // permute concatenates the second half of the last vector and a vector of zeros *res.v.get_unchecked_mut(last) = _mm256_alignr_epi8(_mm256_permute2x128_si256(curr, curr, 0b10000001i32), curr, 4i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn shift_right_1(a: &Self, res: &mut Self) { for i in (1..a.v.len()).rev() { let curr = *a.v.get_unchecked(i); // permute concatenates the second half of the previous vector and the first half of the current vector *res.v.get_unchecked_mut(i) = _mm256_alignr_epi8( curr, _mm256_permute2x128_si256(curr, *a.v.get_unchecked(i - 1), 0b00000011i32), 12i32); } // first one gets to shift in zeros let curr = *a.v.get_unchecked(0); // permute concatenates a vector of zeros and the first half of the first vector *res.v.get_unchecked_mut(0) = _mm256_alignr_epi8(curr, _mm256_permute2x128_si256(curr, curr, 0b00001000i32), 12i32); } #[target_feature(enable = "avx2")] #[inline] unsafe fn triple_argmin(sub: &Self, a_gap: &Self, b_gap: &Self, res_min: &mut Self) -> Self { // return the edit used in addition to doing a min operation let mut v = Vec::with_capacity(sub.v.len()); let twos = _mm256_set1_epi32(2); for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let res_min1 = _mm256_min_epu32(a_gap, b_gap); // a gap: 2 + -1 = 1, b gap: 2 + 0 = 2 let res_arg1 = _mm256_add_epi32(twos, _mm256_cmpeq_epi32(a_gap, res_min1)); let res_min2 = _mm256_min_epu32(sub, res_min1); // sub: 0 let res_arg2 = _mm256_andnot_si256(_mm256_cmpeq_epi32(sub, res_min2), res_arg1); *res_min.v.get_unchecked_mut(i) = res_min2; v.push(res_arg2); } Self{ v: v } } #[target_feature(enable = "avx2")] #[inline] unsafe fn triple_min_length(sub: &Self, a_gap: &Self, b_gap: &Self, sub_length: &Self, a_gap_length: &Self, b_gap_length: &Self, res_min: &mut Self, res_length: &mut Self) { // choose the length based on which edit is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let sub_length = *sub_length.v.get_unchecked(i); let a_gap_length = *a_gap_length.v.get_unchecked(i); let b_gap_length = *b_gap_length.v.get_unchecked(i); let res_min1 = _mm256_min_epu32(a_gap, b_gap); let a_b_gt_mask = _mm256_cmpeq_epi32(a_gap, res_min1); // a gap: -1, b gap: 0 let mut res_length1 = _mm256_blendv_epi8(b_gap_length, a_gap_length, a_b_gt_mask); // lengths based on edits let a_b_eq_mask = _mm256_cmpeq_epi32(a_gap, b_gap); // equal: -1 let a_b_max_len = _mm256_max_epu32(a_gap_length, b_gap_length); res_length1 = _mm256_blendv_epi8(res_length1, a_b_max_len, a_b_eq_mask); // maximize length if edits equal let res_min2 = _mm256_min_epu32(sub, res_min1); let sub_gt_mask = _mm256_cmpeq_epi32(sub, res_min2); // sub: -1, prev a or b gap: 0 let mut res_length2 = _mm256_blendv_epi8(res_length1, sub_length, sub_gt_mask); // length based on edits let sub_eq_mask = _mm256_cmpeq_epi32(sub, res_min1); let sub_max_len = _mm256_max_epu32(sub_length, res_length1); res_length2 = _mm256_blendv_epi8(res_length2, sub_max_len, sub_eq_mask); // maximize length if edits equal *res_min.v.get_unchecked_mut(i) = res_min2; *res_length.v.get_unchecked_mut(i) = res_length2; } } #[target_feature(enable = "avx2")] #[inline] unsafe fn double_min_length(new_gap: &Self, res_cont_gap: &mut 
Self, new_gap_length: &Self, res_cont_gap_length: &mut Self) { // choose the length based on which gap type is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..new_gap.v.len() { let new_gap = *new_gap.v.get_unchecked(i); let cont_gap = *res_cont_gap.v.get_unchecked(i); let new_gap_length = *new_gap_length.v.get_unchecked(i); let cont_gap_length = *res_cont_gap_length.v.get_unchecked(i); let res_min = _mm256_min_epu32(new_gap, cont_gap); let new_cont_gt_mask = _mm256_cmpeq_epi32(new_gap, res_min); // new gap: -1, continue gap: 0 let mut res_length = _mm256_blendv_epi8(cont_gap_length, new_gap_length, new_cont_gt_mask); // lengths based on edits let new_cont_eq_mask = _mm256_cmpeq_epi32(new_gap, cont_gap); // equal: -1 let new_cont_max_len = _mm256_max_epu32(new_gap_length, cont_gap_length); res_length = _mm256_blendv_epi8(res_length, new_cont_max_len, new_cont_eq_mask); // maximize length if edits equal *res_cont_gap.v.get_unchecked_mut(i) = res_min; *res_cont_gap_length.v.get_unchecked_mut(i) = res_length; } } } // this implementation will probably only be used for debugging #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl fmt::Display for AvxNx8x32 { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "avx2")] #[inline] unsafe fn fmt_internal(s: &AvxNx8x32, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "[")?; let mut arr = [0u32; 8]; let arr_ptr = arr.as_mut_ptr() as *mut __m256i; for i in 0..(s.v.len() - 1) { _mm256_storeu_si256(arr_ptr, *s.v.get_unchecked(i)); for j in 0..8 { write!(f, "{:>3}, ", *arr.get_unchecked(j))?; } } // leftover elements _mm256_storeu_si256(arr_ptr, *s.v.get_unchecked(s.v.len() - 1)); let start = (s.v.len() - 1) << 3; for i in 0..(s.upper_bound() - start) { if i == s.upper_bound() - start - 1 { write!(f, "{:>3}", *arr.get_unchecked(i))?; }else{ write!(f, "{:>3}, ", *arr.get_unchecked(i))?; } } return write!(f, "]"); } unsafe { fmt_internal(self, f) } } } /// N x 16 x 8 vector backed with 128-bit SSE vectors. macro_rules! 
create_sse_nx16x8 { ($name:ident, $num:literal) => { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub struct $name { v: [__m128i; $num] } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl Jewel for $name { #[target_feature(enable = "sse4.1")] #[inline] unsafe fn repeating(val: u32, _len: usize) -> Self { let v = [_mm_set1_epi8(val as i8); $num]; Self{ v: v } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn repeating_max(_len: usize) -> Self { let v = [_mm_set1_epi8(-1i8); $num]; Self{ v: v } } #[inline] fn upper_bound(&self) -> usize { self.v.len() << 4 } #[inline] fn static_upper_bound() -> usize { $num << 4 } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn slow_loadu(&mut self, idx: usize, ptr: *const u8, len: usize, reverse: bool) { if len == 0 { return; } let mut arr = [0u8; 16]; let arr_ptr = arr.as_mut_ptr() as *mut __m128i; let store_idx = if reverse {15} else {0}; let load_idx = if reverse {0} else {15}; for i in 0..len { let curr_idx = if reverse {idx - i} else {idx + i}; let arr_idx = curr_idx & 15; if arr_idx == store_idx || i == 0 { _mm_storeu_si128(arr_ptr, *self.v.get_unchecked(curr_idx >> 4)); } *arr.get_unchecked_mut(arr_idx) = *ptr.offset(i as isize); if arr_idx == load_idx || i == len - 1 { *self.v.get_unchecked_mut(curr_idx >> 4) = _mm_loadu_si128(arr_ptr); } } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn slow_extract(&self, i: usize) -> u32 { let idx = i >> 4; let j = i & 15; let mut arr = [0u8; 16]; _mm_storeu_si128(arr.as_mut_ptr() as *mut __m128i, *self.v.get_unchecked(idx)); *arr.get_unchecked(j) as u32 } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn slow_insert(&mut self, i: usize, val: u32) { let idx = i >> 4; let j = i & 15; let mut arr = [0u8; 16]; let arr_ptr = arr.as_mut_ptr() as *mut __m128i; _mm_storeu_si128(arr_ptr, *self.v.get_unchecked(idx)); *arr.get_unchecked_mut(j) = val as u8; *self.v.get_unchecked_mut(idx) = _mm_loadu_si128(arr_ptr); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_0(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi8(*self.v.get_unchecked(last), val as i32, 15i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_1(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi8(*self.v.get_unchecked(last), val as i32, 14i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_2(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi8(*self.v.get_unchecked(last), val as i32, 13i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_max(&mut self) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi8(*self.v.get_unchecked(last), u8::MAX as i32, 15i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_first(&mut self, val: u32) { *self.v.get_unchecked_mut(0) = _mm_insert_epi8(*self.v.get_unchecked(0), val as i32, 0i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_first_max(&mut self) { *self.v.get_unchecked_mut(0) = _mm_insert_epi8(*self.v.get_unchecked(0), u8::MAX as i32, 0i32); } operation_mut_param2!("sse4.1", add_mut, _mm_add_epi8); operation_mut_param2!("sse4.1", adds_mut, _mm_adds_epu8); operation_mut_param2!("sse4.1", and_mut, _mm_and_si128); operation_mut_param2!("sse4.1", andnot_mut, _mm_andnot_si128); operation_mut_param2!("sse4.1", cmpeq_mut, _mm_cmpeq_epi8); 
operation_mut_param2!("sse4.1", min_mut, _mm_min_epu8); operation_mut_param2!("sse4.1", max_mut, _mm_max_epu8); #[target_feature(enable = "sse4.1")] #[inline] unsafe fn blendv_mut(&mut self, b: &Self, mask: &Self) { for i in 0..self.v.len() { *self.v.get_unchecked_mut(i) = _mm_blendv_epi8(*self.v.get_unchecked(i), *b.v.get_unchecked(i), *mask.v.get_unchecked(i)); } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_left_1_mut(&mut self) { for i in 0..(self.v.len() - 1) { *self.v.get_unchecked_mut(i) = _mm_alignr_epi8(*self.v.get_unchecked(i + 1), *self.v.get_unchecked(i), 1i32); } // last one gets to shift in zeros let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_srli_si128(*self.v.get_unchecked(last), 1i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_left_2_mut(&mut self) { for i in 0..(self.v.len() - 1) { *self.v.get_unchecked_mut(i) = _mm_alignr_epi8(*self.v.get_unchecked(i + 1), *self.v.get_unchecked(i), 2i32); } // last one gets to shift in zeros let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_srli_si128(*self.v.get_unchecked(last), 2i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_right_1_mut(&mut self) { for i in (1..self.v.len()).rev() { *self.v.get_unchecked_mut(i) = _mm_alignr_epi8(*self.v.get_unchecked(i), *self.v.get_unchecked(i - 1), 15i32); } // first one gets to shift in zeros *self.v.get_unchecked_mut(0) = _mm_slli_si128(*self.v.get_unchecked(0), 1i32); } operation_param2!("sse4.1", add, _mm_add_epi8); operation_param2!("sse4.1", adds, _mm_adds_epu8); operation_param2!("sse4.1", andnot, _mm_andnot_si128); operation_param2!("sse4.1", cmpeq, _mm_cmpeq_epi8); operation_param2!("sse4.1", min, _mm_min_epu8); operation_param2!("sse4.1", max, _mm_max_epu8); #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_left_1(a: &Self, res: &mut Self) { for i in 0..(a.v.len() - 1) { *res.v.get_unchecked_mut(i) = _mm_alignr_epi8(*a.v.get_unchecked(i + 1), *a.v.get_unchecked(i), 1i32); } // last one gets to shift in zeros let last = a.v.len() - 1; *res.v.get_unchecked_mut(last) = _mm_srli_si128(*a.v.get_unchecked(last), 1i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_right_1(a: &Self, res: &mut Self) { for i in (1..a.v.len()).rev() { *res.v.get_unchecked_mut(i) = _mm_alignr_epi8(*a.v.get_unchecked(i), *a.v.get_unchecked(i - 1), 15i32); } // first one gets to shift in zeros *res.v.get_unchecked_mut(0) = _mm_slli_si128(*a.v.get_unchecked(0), 1i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn triple_argmin(sub: &Self, a_gap: &Self, b_gap: &Self, res_min: &mut Self) -> Self { // return the edit used in addition to doing a min operation let mut v = [_mm_undefined_si128(); $num]; let twos = _mm_set1_epi8(2); for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let res_min1 = _mm_min_epu8(a_gap, b_gap); // a gap: 2 + -1 = 1, b gap: 2 + 0 = 2 let res_arg1 = _mm_add_epi8(twos, _mm_cmpeq_epi8(a_gap, res_min1)); let res_min2 = _mm_min_epu8(sub, res_min1); // sub: 0 let res_arg2 = _mm_andnot_si128(_mm_cmpeq_epi8(sub, res_min2), res_arg1); *res_min.v.get_unchecked_mut(i) = res_min2; *v.get_unchecked_mut(i) = res_arg2; } Self{ v: v } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn triple_min_length(sub: &Self, a_gap: &Self, b_gap: &Self, sub_length: &Self, a_gap_length: &Self, b_gap_length: &Self, res_min: &mut Self, res_length: &mut Self) { // choose the length based 
on which edit is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let sub_length = *sub_length.v.get_unchecked(i); let a_gap_length = *a_gap_length.v.get_unchecked(i); let b_gap_length = *b_gap_length.v.get_unchecked(i); let res_min1 = _mm_min_epu8(a_gap, b_gap); let a_b_gt_mask = _mm_cmpeq_epi8(a_gap, res_min1); // a gap: -1, b gap: 0 let mut res_length1 = _mm_blendv_epi8(b_gap_length, a_gap_length, a_b_gt_mask); // lengths based on edits let a_b_eq_mask = _mm_cmpeq_epi8(a_gap, b_gap); // equal: -1 let a_b_max_len = _mm_max_epu8(a_gap_length, b_gap_length); res_length1 = _mm_blendv_epi8(res_length1, a_b_max_len, a_b_eq_mask); // maximize length if edits equal let res_min2 = _mm_min_epu8(sub, res_min1); let sub_gt_mask = _mm_cmpeq_epi8(sub, res_min2); // sub: -1, prev a or b gap: 0 let mut res_length2 = _mm_blendv_epi8(res_length1, sub_length, sub_gt_mask); // length based on edits let sub_eq_mask = _mm_cmpeq_epi8(sub, res_min1); let sub_max_len = _mm_max_epu8(sub_length, res_length1); res_length2 = _mm_blendv_epi8(res_length2, sub_max_len, sub_eq_mask); // maximize length if edits equal *res_min.v.get_unchecked_mut(i) = res_min2; *res_length.v.get_unchecked_mut(i) = res_length2; } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn double_min_length(new_gap: &Self, res_cont_gap: &mut Self, new_gap_length: &Self, res_cont_gap_length: &mut Self) { // choose the length based on which gap type is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..new_gap.v.len() { let new_gap = *new_gap.v.get_unchecked(i); let cont_gap = *res_cont_gap.v.get_unchecked(i); let new_gap_length = *new_gap_length.v.get_unchecked(i); let cont_gap_length = *res_cont_gap_length.v.get_unchecked(i); let res_min = _mm_min_epu8(new_gap, cont_gap); let new_cont_gt_mask = _mm_cmpeq_epi8(new_gap, res_min); // new gap: -1, continue gap: 0 let mut res_length = _mm_blendv_epi8(cont_gap_length, new_gap_length, new_cont_gt_mask); // lengths based on edits let new_cont_eq_mask = _mm_cmpeq_epi8(new_gap, cont_gap); // equal: -1 let new_cont_max_len = _mm_max_epu8(new_gap_length, cont_gap_length); res_length = _mm_blendv_epi8(res_length, new_cont_max_len, new_cont_eq_mask); // maximize length if edits equal *res_cont_gap.v.get_unchecked_mut(i) = res_min; *res_cont_gap_length.v.get_unchecked_mut(i) = res_length; } } } // this implementation will probably only be used for debugging #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl fmt::Display for $name { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse4.1")] #[inline] unsafe fn fmt_internal(s: &$name, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "[")?; let mut arr = [0u8; 16]; let arr_ptr = arr.as_mut_ptr() as *mut __m128i; for i in 0..(s.v.len() - 1) { _mm_storeu_si128(arr_ptr, *s.v.get_unchecked(i)); for j in 0..16 { write!(f, "{:>3}, ", *arr.get_unchecked(j))?; } } // leftover elements _mm_storeu_si128(arr_ptr, *s.v.get_unchecked(s.v.len() - 1)); let start = (s.v.len() - 1) << 4; for i in 0..(s.upper_bound() - start) { if i == s.upper_bound() - start - 1 { write!(f, "{:>3}", *arr.get_unchecked(i))?; }else{ write!(f, "{:>3}, ", *arr.get_unchecked(i))?; } } write!(f, "]") } unsafe { fmt_internal(self, f) } } } }; } // constant 
array size, so the compiler should unroll the loops create_sse_nx16x8!(Sse1x16x8, 1); create_sse_nx16x8!(Sse2x16x8, 2); create_sse_nx16x8!(Sse4x16x8, 4); create_sse_nx16x8!(Sse8x16x8, 8); create_sse_nx16x8!(Sse16x16x8, 16); /// N x 8 x 16 vector backed with 128-bit SSE vectors. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub struct SseNx8x16 { v: Vec<__m128i> } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl Jewel for SseNx8x16 { #[target_feature(enable = "sse4.1")] #[inline] unsafe fn repeating(val: u32, len: usize) -> Self { let v = vec![_mm_set1_epi16(val as i16); (len >> 3) + if (len & 7) > 0 {1} else {0}]; Self{ v: v } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn repeating_max(len: usize) -> Self { let v = vec![_mm_set1_epi16(-1i16); (len >> 3) + if (len & 7) > 0 {1} else {0}]; Self{ v: v } } #[inline] fn upper_bound(&self) -> usize { self.v.len() << 3 } #[inline] fn static_upper_bound() -> usize { unimplemented!() } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn slow_loadu(&mut self, idx: usize, ptr: *const u8, len: usize, reverse: bool) { if len == 0 { return; } let mut arr = [0u16; 8]; let arr_ptr = arr.as_mut_ptr() as *mut __m128i; let store_idx = if reverse {7} else {0}; let load_idx = if reverse {0} else {7}; for i in 0..len { let curr_idx = if reverse {idx - i} else {idx + i}; let arr_idx = curr_idx & 7; if arr_idx == store_idx || i == 0 { _mm_storeu_si128(arr_ptr, *self.v.get_unchecked(curr_idx >> 3)); } *arr.get_unchecked_mut(arr_idx) = *ptr.offset(i as isize) as u16; if arr_idx == load_idx || i == len - 1 { *self.v.get_unchecked_mut(curr_idx >> 3) = _mm_loadu_si128(arr_ptr); } } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn slow_extract(&self, i: usize) -> u32 { let idx = i >> 3; let j = i & 7; let mut arr = [0u16; 8]; _mm_storeu_si128(arr.as_mut_ptr() as *mut __m128i, *self.v.get_unchecked(idx)); *arr.get_unchecked(j) as u32 } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn slow_insert(&mut self, i: usize, val: u32) { let idx = i >> 3; let j = i & 7; let mut arr = [0u16; 8]; let arr_ptr = arr.as_mut_ptr() as *mut __m128i; _mm_storeu_si128(arr_ptr, *self.v.get_unchecked(idx)); *arr.get_unchecked_mut(j) = val as u16; *self.v.get_unchecked_mut(idx) = _mm_loadu_si128(arr_ptr); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_0(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi16(*self.v.get_unchecked(last), val as i32, 7i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_1(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi16(*self.v.get_unchecked(last), val as i32, 6i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_2(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi16(*self.v.get_unchecked(last), val as i32, 5i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_max(&mut self) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi16(*self.v.get_unchecked(last), u16::MAX as i32, 7i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_first(&mut self, val: u32) { *self.v.get_unchecked_mut(0) = _mm_insert_epi16(*self.v.get_unchecked(0), val as i32, 0i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_first_max(&mut self) { *self.v.get_unchecked_mut(0) = _mm_insert_epi16(*self.v.get_unchecked(0), u16::MAX 
as i32, 0i32); } operation_mut_param2!("sse4.1", add_mut, _mm_add_epi16); operation_mut_param2!("sse4.1", adds_mut, _mm_adds_epu16); operation_mut_param2!("sse4.1", and_mut, _mm_and_si128); operation_mut_param2!("sse4.1", andnot_mut, _mm_andnot_si128); operation_mut_param2!("sse4.1", cmpeq_mut, _mm_cmpeq_epi16); operation_mut_param2!("sse4.1", min_mut, _mm_min_epu16); operation_mut_param2!("sse4.1", max_mut, _mm_max_epu16); #[target_feature(enable = "sse4.1")] #[inline] unsafe fn blendv_mut(&mut self, b: &Self, mask: &Self) { for i in 0..self.v.len() { *self.v.get_unchecked_mut(i) = _mm_blendv_epi8(*self.v.get_unchecked(i), *b.v.get_unchecked(i), *mask.v.get_unchecked(i)); } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_left_1_mut(&mut self) { for i in 0..(self.v.len() - 1) { *self.v.get_unchecked_mut(i) = _mm_alignr_epi8(*self.v.get_unchecked(i + 1), *self.v.get_unchecked(i), 2i32); } // last one gets to shift in zeros let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_srli_si128(*self.v.get_unchecked(last), 2i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_left_2_mut(&mut self) { for i in 0..(self.v.len() - 1) { *self.v.get_unchecked_mut(i) = _mm_alignr_epi8(*self.v.get_unchecked(i + 1), *self.v.get_unchecked(i), 4i32); } // last one gets to shift in zeros let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_srli_si128(*self.v.get_unchecked(last), 4i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_right_1_mut(&mut self) { for i in (1..self.v.len()).rev() { *self.v.get_unchecked_mut(i) = _mm_alignr_epi8(*self.v.get_unchecked(i), *self.v.get_unchecked(i - 1), 14i32); } // first one gets to shift in zeros *self.v.get_unchecked_mut(0) = _mm_slli_si128(*self.v.get_unchecked(0), 2i32); } operation_param2!("sse4.1", add, _mm_add_epi16); operation_param2!("sse4.1", adds, _mm_adds_epu16); operation_param2!("sse4.1", andnot, _mm_andnot_si128); operation_param2!("sse4.1", cmpeq, _mm_cmpeq_epi16); operation_param2!("sse4.1", min, _mm_min_epu16); operation_param2!("sse4.1", max, _mm_max_epu16); #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_left_1(a: &Self, res: &mut Self) { for i in 0..(a.v.len() - 1) { *res.v.get_unchecked_mut(i) = _mm_alignr_epi8(*a.v.get_unchecked(i + 1), *a.v.get_unchecked(i), 2i32); } // last one gets to shift in zeros let last = a.v.len() - 1; *res.v.get_unchecked_mut(last) = _mm_srli_si128(*a.v.get_unchecked(last), 2i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_right_1(a: &Self, res: &mut Self) { for i in (1..a.v.len()).rev() { *res.v.get_unchecked_mut(i) = _mm_alignr_epi8(*a.v.get_unchecked(i), *a.v.get_unchecked(i - 1), 14i32); } // first one gets to shift in zeros *res.v.get_unchecked_mut(0) = _mm_slli_si128(*a.v.get_unchecked(0), 2i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn triple_argmin(sub: &Self, a_gap: &Self, b_gap: &Self, res_min: &mut Self) -> Self { // return the edit used in addition to doing a min operation let mut v = Vec::with_capacity(sub.v.len()); let twos = _mm_set1_epi16(2); for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let res_min1 = _mm_min_epu16(a_gap, b_gap); // a gap: 2 + -1 = 1, b gap: 2 + 0 = 2 let res_arg1 = _mm_add_epi16(twos, _mm_cmpeq_epi16(a_gap, res_min1)); let res_min2 = _mm_min_epu16(sub, res_min1); // sub: 0 let res_arg2 = _mm_andnot_si128(_mm_cmpeq_epi16(sub, res_min2), res_arg1); 
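// (Editor's note) res_arg2 now encodes, per lane, which argument produced the minimum:
// 0 = substitution, 1 = a gap, 2 = b gap. The cmpeq masks are reused as the arithmetic values -1/0,
// and ties resolve toward the substitution first, then toward the a gap. The triple_argmin
// implementations above follow the same encoding.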
*res_min.v.get_unchecked_mut(i) = res_min2; v.push(res_arg2); } Self{ v: v } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn triple_min_length(sub: &Self, a_gap: &Self, b_gap: &Self, sub_length: &Self, a_gap_length: &Self, b_gap_length: &Self, res_min: &mut Self, res_length: &mut Self) { // choose the length based on which edit is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let sub_length = *sub_length.v.get_unchecked(i); let a_gap_length = *a_gap_length.v.get_unchecked(i); let b_gap_length = *b_gap_length.v.get_unchecked(i); let res_min1 = _mm_min_epu16(a_gap, b_gap); let a_b_gt_mask = _mm_cmpeq_epi16(a_gap, res_min1); // a gap: -1, b gap: 0 let mut res_length1 = _mm_blendv_epi8(b_gap_length, a_gap_length, a_b_gt_mask); // lengths based on edits let a_b_eq_mask = _mm_cmpeq_epi16(a_gap, b_gap); // equal: -1 let a_b_max_len = _mm_max_epu16(a_gap_length, b_gap_length); res_length1 = _mm_blendv_epi8(res_length1, a_b_max_len, a_b_eq_mask); // maximize length if edits equal let res_min2 = _mm_min_epu16(sub, res_min1); let sub_gt_mask = _mm_cmpeq_epi16(sub, res_min2); // sub: -1, prev a or b gap: 0 let mut res_length2 = _mm_blendv_epi8(res_length1, sub_length, sub_gt_mask); // length based on edits let sub_eq_mask = _mm_cmpeq_epi16(sub, res_min1); let sub_max_len = _mm_max_epu16(sub_length, res_length1); res_length2 = _mm_blendv_epi8(res_length2, sub_max_len, sub_eq_mask); // maximize length if edits equal *res_min.v.get_unchecked_mut(i) = res_min2; *res_length.v.get_unchecked_mut(i) = res_length2; } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn double_min_length(new_gap: &Self, res_cont_gap: &mut Self, new_gap_length: &Self, res_cont_gap_length: &mut Self) { // choose the length based on which gap type is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..new_gap.v.len() { let new_gap = *new_gap.v.get_unchecked(i); let cont_gap = *res_cont_gap.v.get_unchecked(i); let new_gap_length = *new_gap_length.v.get_unchecked(i); let cont_gap_length = *res_cont_gap_length.v.get_unchecked(i); let res_min = _mm_min_epu16(new_gap, cont_gap); let new_cont_gt_mask = _mm_cmpeq_epi16(new_gap, res_min); // new gap: -1, continue gap: 0 let mut res_length = _mm_blendv_epi8(cont_gap_length, new_gap_length, new_cont_gt_mask); // lengths based on edits let new_cont_eq_mask = _mm_cmpeq_epi16(new_gap, cont_gap); // equal: -1 let new_cont_max_len = _mm_max_epu16(new_gap_length, cont_gap_length); res_length = _mm_blendv_epi8(res_length, new_cont_max_len, new_cont_eq_mask); // maximize length if edits equal *res_cont_gap.v.get_unchecked_mut(i) = res_min; *res_cont_gap_length.v.get_unchecked_mut(i) = res_length; } } } // this implementation will probably only be used for debugging #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl fmt::Display for SseNx8x16 { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse4.1")] #[inline] unsafe fn fmt_internal(s: &SseNx8x16, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "[")?; let mut arr = [0u16; 8]; let arr_ptr = arr.as_mut_ptr() as *mut __m128i; for i in 0..(s.v.len() - 1) { _mm_storeu_si128(arr_ptr, *s.v.get_unchecked(i)); for j in 0..8 { write!(f, "{:>3}, ", *arr.get_unchecked(j))?; } } // leftover elements 
_mm_storeu_si128(arr_ptr, *s.v.get_unchecked(s.v.len() - 1)); let start = (s.v.len() - 1) << 3; for i in 0..(s.upper_bound() - start) { if i == s.upper_bound() - start - 1 { write!(f, "{:>3}", *arr.get_unchecked(i))?; }else{ write!(f, "{:>3}, ", *arr.get_unchecked(i))?; } } write!(f, "]") } unsafe { fmt_internal(self, f) } } } /// N x 4 x 32 vector backed with 128-bit SSE vectors. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub struct SseNx4x32 { v: Vec<__m128i> } /// Workaround for the lack of the _mm_adds_epu32 intrinsic. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse4.1")] #[inline] unsafe fn _mm_adds_epu32(a: __m128i, b: __m128i) -> __m128i { let sum = _mm_add_epi32(a, b); let min = _mm_min_epu32(a, sum); let eq = _mm_cmpeq_epi32(a, min); // if the sum is less than a, then saturate // note: sum is either greater than both a and b or less than both _mm_blendv_epi8(_mm_set1_epi32(-1i32), sum, eq) } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl Jewel for SseNx4x32 { #[target_feature(enable = "sse4.1")] #[inline] unsafe fn repeating(val: u32, len: usize) -> Self { let v = vec![_mm_set1_epi32(val as i32); (len >> 2) + if (len & 3) > 0 {1} else {0}]; Self{ v: v } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn repeating_max(len: usize) -> Self { let v = vec![_mm_set1_epi32(-1i32); (len >> 2) + if (len & 3) > 0 {1} else {0}]; Self{ v: v } } #[inline] fn upper_bound(&self) -> usize { self.v.len() << 2 } #[inline] fn static_upper_bound() -> usize { unimplemented!() } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn slow_loadu(&mut self, idx: usize, ptr: *const u8, len: usize, reverse: bool) { if len == 0 { return; } let mut arr = [0u32; 4]; let arr_ptr = arr.as_mut_ptr() as *mut __m128i; let store_idx = if reverse {3} else {0}; let load_idx = if reverse {0} else {3}; for i in 0..len { let curr_idx = if reverse {idx - i} else {idx + i}; let arr_idx = curr_idx & 3; if arr_idx == store_idx || i == 0 { _mm_storeu_si128(arr_ptr, *self.v.get_unchecked(curr_idx >> 2)); } *arr.get_unchecked_mut(arr_idx) = *ptr.offset(i as isize) as u32; if arr_idx == load_idx || i == len - 1 { *self.v.get_unchecked_mut(curr_idx >> 2) = _mm_loadu_si128(arr_ptr); } } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn slow_extract(&self, i: usize) -> u32 { let idx = i >> 2; let j = i & 3; let mut arr = [0u32; 4]; _mm_storeu_si128(arr.as_mut_ptr() as *mut __m128i, *self.v.get_unchecked(idx)); *arr.get_unchecked(j) } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn slow_insert(&mut self, i: usize, val: u32) { let idx = i >> 2; let j = i & 3; let mut arr = [0u32; 4]; let arr_ptr = arr.as_mut_ptr() as *mut __m128i; _mm_storeu_si128(arr_ptr, *self.v.get_unchecked(idx)); *arr.get_unchecked_mut(j) = val; *self.v.get_unchecked_mut(idx) = _mm_loadu_si128(arr_ptr); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_0(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi32(*self.v.get_unchecked(last), val as i32, 3i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_1(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi32(*self.v.get_unchecked(last), val as i32, 2i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_2(&mut self, val: u32) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi32(*self.v.get_unchecked(last), val as i32, 1i32); } 
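// (Editor's note) insert_last_0/1/2 write into lanes 3, 2, and 1 of the final __m128i, i.e. the last,
// second-to-last, and third-to-last 32-bit elements. insert_last_max below stores -1i32, which reads as
// u32::MAX when the lanes are treated as unsigned, matching repeating_max.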
#[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_last_max(&mut self) { let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_insert_epi32(*self.v.get_unchecked(last), -1i32, 3i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_first(&mut self, val: u32) { *self.v.get_unchecked_mut(0) = _mm_insert_epi32(*self.v.get_unchecked(0), val as i32, 0i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn insert_first_max(&mut self) { *self.v.get_unchecked_mut(0) = _mm_insert_epi32(*self.v.get_unchecked(0), -1i32, 0i32); } operation_mut_param2!("sse4.1", add_mut, _mm_add_epi32); operation_mut_param2!("sse4.1", adds_mut, _mm_adds_epu32); operation_mut_param2!("sse4.1", and_mut, _mm_and_si128); operation_mut_param2!("sse4.1", andnot_mut, _mm_andnot_si128); operation_mut_param2!("sse4.1", cmpeq_mut, _mm_cmpeq_epi32); operation_mut_param2!("sse4.1", min_mut, _mm_min_epu32); operation_mut_param2!("sse4.1", max_mut, _mm_max_epu32); #[target_feature(enable = "sse4.1")] #[inline] unsafe fn blendv_mut(&mut self, b: &Self, mask: &Self) { for i in 0..self.v.len() { *self.v.get_unchecked_mut(i) = _mm_blendv_epi8(*self.v.get_unchecked(i), *b.v.get_unchecked(i), *mask.v.get_unchecked(i)); } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_left_1_mut(&mut self) { for i in 0..(self.v.len() - 1) { *self.v.get_unchecked_mut(i) = _mm_alignr_epi8(*self.v.get_unchecked(i + 1), *self.v.get_unchecked(i), 4i32); } // last one gets to shift in zeros let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_srli_si128(*self.v.get_unchecked(last), 4i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_left_2_mut(&mut self) { for i in 0..(self.v.len() - 1) { *self.v.get_unchecked_mut(i) = _mm_alignr_epi8(*self.v.get_unchecked(i + 1), *self.v.get_unchecked(i), 8i32); } // last one gets to shift in zeros let last = self.v.len() - 1; *self.v.get_unchecked_mut(last) = _mm_srli_si128(*self.v.get_unchecked(last), 8i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_right_1_mut(&mut self) { for i in (1..self.v.len()).rev() { *self.v.get_unchecked_mut(i) = _mm_alignr_epi8(*self.v.get_unchecked(i), *self.v.get_unchecked(i - 1), 12i32); } // first one gets to shift in zeros *self.v.get_unchecked_mut(0) = _mm_slli_si128(*self.v.get_unchecked(0), 4i32); } operation_param2!("sse4.1", add, _mm_add_epi32); operation_param2!("sse4.1", adds, _mm_adds_epu32); operation_param2!("sse4.1", andnot, _mm_andnot_si128); operation_param2!("sse4.1", cmpeq, _mm_cmpeq_epi32); operation_param2!("sse4.1", min, _mm_min_epu32); operation_param2!("sse4.1", max, _mm_max_epu32); #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_left_1(a: &Self, res: &mut Self) { for i in 0..(a.v.len() - 1) { *res.v.get_unchecked_mut(i) = _mm_alignr_epi8(*a.v.get_unchecked(i + 1), *a.v.get_unchecked(i), 4i32); } // last one gets to shift in zeros let last = a.v.len() - 1; *res.v.get_unchecked_mut(last) = _mm_srli_si128(*a.v.get_unchecked(last), 4i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn shift_right_1(a: &Self, res: &mut Self) { for i in (1..a.v.len()).rev() { *res.v.get_unchecked_mut(i) = _mm_alignr_epi8(*a.v.get_unchecked(i), *a.v.get_unchecked(i - 1), 12i32); } // first one gets to shift in zeros *res.v.get_unchecked_mut(0) = _mm_slli_si128(*a.v.get_unchecked(0), 4i32); } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn triple_argmin(sub: &Self, a_gap: &Self, b_gap: &Self, res_min: &mut Self) -> Self { // return the 
edit used in addition to doing a min operation let mut v = Vec::with_capacity(sub.v.len()); let twos = _mm_set1_epi32(2); for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let res_min1 = _mm_min_epu32(a_gap, b_gap); // a gap: 2 + -1 = 1, b gap: 2 + 0 = 2 let res_arg1 = _mm_add_epi32(twos, _mm_cmpeq_epi32(a_gap, res_min1)); let res_min2 = _mm_min_epu32(sub, res_min1); // sub: 0 let res_arg2 = _mm_andnot_si128(_mm_cmpeq_epi32(sub, res_min2), res_arg1); *res_min.v.get_unchecked_mut(i) = res_min2; v.push(res_arg2); } Self{ v: v } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn triple_min_length(sub: &Self, a_gap: &Self, b_gap: &Self, sub_length: &Self, a_gap_length: &Self, b_gap_length: &Self, res_min: &mut Self, res_length: &mut Self) { // choose the length based on which edit is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..sub.v.len() { let sub = *sub.v.get_unchecked(i); let a_gap = *a_gap.v.get_unchecked(i); let b_gap = *b_gap.v.get_unchecked(i); let sub_length = *sub_length.v.get_unchecked(i); let a_gap_length = *a_gap_length.v.get_unchecked(i); let b_gap_length = *b_gap_length.v.get_unchecked(i); let res_min1 = _mm_min_epu32(a_gap, b_gap); let a_b_gt_mask = _mm_cmpeq_epi32(a_gap, res_min1); // a gap: -1, b gap: 0 let mut res_length1 = _mm_blendv_epi8(b_gap_length, a_gap_length, a_b_gt_mask); // lengths based on edits let a_b_eq_mask = _mm_cmpeq_epi32(a_gap, b_gap); // equal: -1 let a_b_max_len = _mm_max_epu32(a_gap_length, b_gap_length); res_length1 = _mm_blendv_epi8(res_length1, a_b_max_len, a_b_eq_mask); // maximize length if edits equal let res_min2 = _mm_min_epu32(sub, res_min1); let sub_gt_mask = _mm_cmpeq_epi32(sub, res_min2); // sub: -1, prev a or b gap: 0 let mut res_length2 = _mm_blendv_epi8(res_length1, sub_length, sub_gt_mask); // length based on edits let sub_eq_mask = _mm_cmpeq_epi32(sub, res_min1); let sub_max_len = _mm_max_epu32(sub_length, res_length1); res_length2 = _mm_blendv_epi8(res_length2, sub_max_len, sub_eq_mask); // maximize length if edits equal *res_min.v.get_unchecked_mut(i) = res_min2; *res_length.v.get_unchecked_mut(i) = res_length2; } } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn double_min_length(new_gap: &Self, res_cont_gap: &mut Self, new_gap_length: &Self, res_cont_gap_length: &mut Self) { // choose the length based on which gap type is chosen during the min operation // secondary objective of maximizing length if edit costs equal for i in 0..new_gap.v.len() { let new_gap = *new_gap.v.get_unchecked(i); let cont_gap = *res_cont_gap.v.get_unchecked(i); let new_gap_length = *new_gap_length.v.get_unchecked(i); let cont_gap_length = *res_cont_gap_length.v.get_unchecked(i); let res_min = _mm_min_epu32(new_gap, cont_gap); let new_cont_gt_mask = _mm_cmpeq_epi32(new_gap, res_min); // new gap: -1, continue gap: 0 let mut res_length = _mm_blendv_epi8(cont_gap_length, new_gap_length, new_cont_gt_mask); // lengths based on edits let new_cont_eq_mask = _mm_cmpeq_epi32(new_gap, cont_gap); // equal: -1 let new_cont_max_len = _mm_max_epu32(new_gap_length, cont_gap_length); res_length = _mm_blendv_epi8(res_length, new_cont_max_len, new_cont_eq_mask); // maximize length if edits equal *res_cont_gap.v.get_unchecked_mut(i) = res_min; *res_cont_gap_length.v.get_unchecked_mut(i) = res_length; } } } // this implementation will probably only be used for debugging #[cfg(any(target_arch = "x86", target_arch 
= "x86_64"))] impl fmt::Display for SseNx4x32 { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse4.1")] #[inline] unsafe fn fmt_internal(s: &SseNx4x32, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "[")?; let mut arr = [0u32; 4]; let arr_ptr = arr.as_mut_ptr() as *mut __m128i; for i in 0..(s.v.len() - 1) { _mm_storeu_si128(arr_ptr, *s.v.get_unchecked(i)); for j in 0..4 { write!(f, "{:>3}, ", *arr.get_unchecked(j))?; } } // leftover elements _mm_storeu_si128(arr_ptr, *s.v.get_unchecked(s.v.len() - 1)); let start = (s.v.len() - 1) << 2; for i in 0..(s.upper_bound() - start) { if i == s.upper_bound() - start - 1 { write!(f, "{:>3}", *arr.get_unchecked(i))?; }else{ write!(f, "{:>3}, ", *arr.get_unchecked(i))?; } } write!(f, "]") } unsafe { fmt_internal(self, f) } } } pub trait HammingJewel { unsafe fn loadu(ptr: *const u8, len: usize) -> Self; fn upper_bound(&self) -> usize; unsafe fn mm_count_mismatches(a_ptr: *const u8, b_ptr: *const u8, len: usize) -> u32; unsafe fn count_mismatches(a_ptr: *const u8, b_ptr: *const u8, len: usize) -> u32; unsafe fn vector_count_mismatches(a: &Self, b_ptr: *const u8, len: usize) -> u32; } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub struct Avx { v: Vec<__m256i> } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl HammingJewel for Avx { #[target_feature(enable = "avx2")] #[inline] unsafe fn loadu(ptr: *const u8, len: usize) -> Self { let word_len = len >> 5; let word_rem = len & 31; let mut v = Vec::with_capacity(word_len + if word_rem > 0 {1} else {0}); let avx2_ptr = ptr as *const __m256i; for i in 0..word_len { v.push(_mm256_loadu_si256(avx2_ptr.offset(i as isize))); } if word_rem > 0 { let mut arr = [0u8; 32]; let end_ptr = ptr.offset((word_len << 5) as isize); for i in 0..word_rem { *arr.get_unchecked_mut(i) = *end_ptr.offset(i as isize); } v.push(_mm256_loadu_si256(arr.as_ptr() as *const __m256i)); } Self{ v: v } } #[inline] fn upper_bound(&self) -> usize { self.v.len() << 5 } #[target_feature(enable = "avx2")] #[inline] unsafe fn mm_count_mismatches(a_ptr: *const u8, b_ptr: *const u8, len: usize) -> u32 { let mut res = 0u32; let div_len = (len >> 5) as isize; let avx2_a_ptr = a_ptr as *const __m256i; let avx2_b_ptr = b_ptr as *const __m256i; for i in 0..div_len { let a = _mm256_loadu_si256(avx2_a_ptr.offset(i)); let b = _mm256_loadu_si256(avx2_b_ptr.offset(i)); let eq = _mm256_cmpeq_epi8(a, b); // basic movemask count equal bytes res += _mm256_movemask_epi8(eq).count_ones(); } for i in (div_len << 5)..len as isize { res += (*a_ptr.offset(i) == *b_ptr.offset(i)) as u32; } len as u32 - res } #[target_feature(enable = "avx2")] #[inline] unsafe fn count_mismatches(a_ptr: *const u8, b_ptr: *const u8, len: usize) -> u32 { let refresh_len = (len / (255 * 32)) as isize; let zeros = _mm256_setzero_si256(); let mut sad = zeros; let avx2_a_ptr = a_ptr as *const __m256i; let avx2_b_ptr = b_ptr as *const __m256i; for i in 0..refresh_len { let mut curr = zeros; for j in (i * 255)..((i + 1) * 255) { let a = _mm256_loadu_si256(avx2_a_ptr.offset(j)); let b = _mm256_loadu_si256(avx2_b_ptr.offset(j)); let eq = _mm256_cmpeq_epi8(a, b); curr = _mm256_sub_epi8(curr, eq); // subtract -1 = add 1 when matching // counting matches instead of mismatches for speed } // subtract 0 and sum up 8 bytes at once horizontally into four 64 bit ints // accumulate those 64 bit ints sad = _mm256_add_epi64(sad, _mm256_sad_epu8(curr, zeros)); } let word_len = (len >> 5) 
as isize; let mut curr = zeros; // leftover blocks of 32 bytes for i in (refresh_len * 255)..word_len { let a = _mm256_loadu_si256(avx2_a_ptr.offset(i)); let b = _mm256_loadu_si256(avx2_b_ptr.offset(i)); let eq = _mm256_cmpeq_epi8(a, b); curr = _mm256_sub_epi8(curr, eq); // subtract -1 = add 1 when matching } sad = _mm256_add_epi64(sad, _mm256_sad_epu8(curr, zeros)); let mut sad_arr = [0u32; 8]; _mm256_storeu_si256(sad_arr.as_mut_ptr() as *mut __m256i, sad); let mut res = *sad_arr.get_unchecked(0) + *sad_arr.get_unchecked(2) + *sad_arr.get_unchecked(4) + *sad_arr.get_unchecked(6); for i in (word_len << 5)..len as isize { res += (*a_ptr.offset(i) == *b_ptr.offset(i)) as u32; } len as u32 - res } #[target_feature(enable = "avx2")] #[inline] unsafe fn vector_count_mismatches(a: &Self, b_ptr: *const u8, len: usize) -> u32 { let refresh_len = (a.v.len() / 255) as isize; let zeros = _mm256_setzero_si256(); let mut sad = zeros; let avx2_b_ptr = b_ptr as *const __m256i; for i in 0..refresh_len { let mut curr = zeros; for j in (i * 255)..((i + 1) * 255) { let a = *a.v.get_unchecked(j as usize); let b = _mm256_loadu_si256(avx2_b_ptr.offset(j)); let eq = _mm256_cmpeq_epi8(a, b); curr = _mm256_sub_epi8(curr, eq); // subtract -1 = add 1 when matching // counting matches instead of mismatches for speed } // subtract 0 and sum up 8 bytes at once horizontally into four 64 bit ints // accumulate those 64 bit ints sad = _mm256_add_epi64(sad, _mm256_sad_epu8(curr, zeros)); } let mut curr = zeros; // leftover blocks of 32 bytes for i in (refresh_len * 255)..a.v.len() as isize { let a = *a.v.get_unchecked(i as usize); let b = _mm256_loadu_si256(avx2_b_ptr.offset(i)); let eq = _mm256_cmpeq_epi8(a, b); curr = _mm256_sub_epi8(curr, eq); // subtract -1 = add 1 when matching } sad = _mm256_add_epi64(sad, _mm256_sad_epu8(curr, zeros)); let mut sad_arr = [0u32; 8]; _mm256_storeu_si256(sad_arr.as_mut_ptr() as *mut __m256i, sad); let res = *sad_arr.get_unchecked(0) + *sad_arr.get_unchecked(2) + *sad_arr.get_unchecked(4) + *sad_arr.get_unchecked(6); len as u32 - res } } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub struct Sse { v: Vec<__m128i> } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] impl HammingJewel for Sse { #[target_feature(enable = "sse4.1")] #[inline] unsafe fn loadu(ptr: *const u8, len: usize) -> Self { let word_len = len >> 4; let word_rem = len & 15; let mut v = Vec::with_capacity(word_len + if word_rem > 0 {1} else {0}); let sse_ptr = ptr as *const __m128i; for i in 0..word_len { v.push(_mm_loadu_si128(sse_ptr.offset(i as isize))); } if word_rem > 0 { let mut arr = [0u8; 16]; let end_ptr = ptr.offset((word_len << 4) as isize); for i in 0..word_rem { *arr.get_unchecked_mut(i) = *end_ptr.offset(i as isize); } v.push(_mm_loadu_si128(arr.as_ptr() as *const __m128i)); } Self{ v: v } } #[inline] fn upper_bound(&self) -> usize { self.v.len() << 4 } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn mm_count_mismatches(a_ptr: *const u8, b_ptr: *const u8, len: usize) -> u32 { let mut res = 0u32; let div_len = (len >> 4) as isize; let sse_a_ptr = a_ptr as *const __m128i; let sse_b_ptr = b_ptr as *const __m128i; for i in 0..div_len { let a = _mm_loadu_si128(sse_a_ptr.offset(i)); let b = _mm_loadu_si128(sse_b_ptr.offset(i)); let eq = _mm_cmpeq_epi8(a, b); // basic movemask count equal bytes res += _mm_movemask_epi8(eq).count_ones(); } for i in (div_len << 4)..len as isize { res += (*a_ptr.offset(i) == *b_ptr.offset(i)) as u32; } len as u32 - res } #[target_feature(enable = "sse4.1")] 
#[inline] unsafe fn count_mismatches(a_ptr: *const u8, b_ptr: *const u8, len: usize) -> u32 { let refresh_len = (len / (255 * 16)) as isize; let zeros = _mm_setzero_si128(); let mut sad = zeros; let sse_a_ptr = a_ptr as *const __m128i; let sse_b_ptr = b_ptr as *const __m128i; for i in 0..refresh_len { let mut curr = zeros; for j in (i * 255)..((i + 1) * 255) { let a = _mm_loadu_si128(sse_a_ptr.offset(j)); let b = _mm_loadu_si128(sse_b_ptr.offset(j)); let eq = _mm_cmpeq_epi8(a, b); curr = _mm_sub_epi8(curr, eq); // subtract -1 = add 1 when matching // counting matches instead of mismatches for speed } // subtract 0 and sum up 8 bytes at once horizontally into two 64 bit ints // accumulate those 64 bit ints sad = _mm_add_epi64(sad, _mm_sad_epu8(curr, zeros)); } let word_len = (len >> 4) as isize; let mut curr = zeros; // leftover blocks of 16 bytes for i in (refresh_len * 255)..word_len { let a = _mm_loadu_si128(sse_a_ptr.offset(i)); let b = _mm_loadu_si128(sse_b_ptr.offset(i)); let eq = _mm_cmpeq_epi8(a, b); curr = _mm_sub_epi8(curr, eq); // subtract -1 = add 1 when matching } sad = _mm_add_epi64(sad, _mm_sad_epu8(curr, zeros)); let mut sad_arr = [0u32; 4]; _mm_storeu_si128(sad_arr.as_mut_ptr() as *mut __m128i, sad); let mut res = *sad_arr.get_unchecked(0) + *sad_arr.get_unchecked(2); for i in (word_len << 4)..len as isize { res += (*a_ptr.offset(i) == *b_ptr.offset(i)) as u32; } len as u32 - res } #[target_feature(enable = "sse4.1")] #[inline] unsafe fn vector_count_mismatches(a: &Self, b_ptr: *const u8, len: usize) -> u32 { let refresh_len = (a.v.len() / 255) as isize; let zeros = _mm_setzero_si128(); let mut sad = zeros; let sse_b_ptr = b_ptr as *const __m128i; for i in 0..refresh_len { let mut curr = zeros; for j in (i * 255)..((i + 1) * 255) { let a = *a.v.get_unchecked(j as usize); let b = _mm_loadu_si128(sse_b_ptr.offset(j)); let eq = _mm_cmpeq_epi8(a, b); curr = _mm_sub_epi8(curr, eq); // subtract -1 = add 1 when matching // counting matches instead of mismatches for speed } // subtract 0 and sum up 8 bytes at once horizontally into two 64 bit ints // accumulate those 64 bit ints sad = _mm_add_epi64(sad, _mm_sad_epu8(curr, zeros)); } let mut curr = zeros; // leftover blocks of 16 bytes for i in (refresh_len * 255)..a.v.len() as isize { let a = *a.v.get_unchecked(i as usize); let b = _mm_loadu_si128(sse_b_ptr.offset(i)); let eq = _mm_cmpeq_epi8(a, b); curr = _mm_sub_epi8(curr, eq); // subtract -1 = add 1 when matching } sad = _mm_add_epi64(sad, _mm_sad_epu8(curr, zeros)); let mut sad_arr = [0u32; 4]; _mm_storeu_si128(sad_arr.as_mut_ptr() as *mut __m128i, sad); let res = *sad_arr.get_unchecked(0) + *sad_arr.get_unchecked(2); len as u32 - res } } triple_accel-0.4.0/src/levenshtein.rs000064400000000000000000002377340000000000000157310ustar 00000000000000//! This module provides many Levenshtein distance routines. //! //! These distance functions share the same efficient underlying SIMD-accelerated implementation: //! * `levenshtein_exp` for low number of edits, otherwise `levenshtein` //! * `rdamerau_exp` for low number of edits, otherwise `rdamerau` //! * `levenshtein_simd_k` //! * `levenshtein_simd_k_with_opts` //! //! These search functions share the same efficient underlying SIMD-accelerated implementation: //! * `levenshtein_search` //! * `levenshtein_search_simd` //! * `levenshtein_search_simd_with_opts` use std::*; use super::*; use super::jewel::*; /// A struct holding the edit costs for mismatches, gaps, and possibly transpositions. 
/// /// This should be used as a parameter for Levenshtein distance or search routines. #[derive(Copy, Clone, Debug)] pub struct EditCosts { mismatch_cost: u8, gap_cost: u8, start_gap_cost: u8, transpose_cost: Option<u8> } impl EditCosts { /// Create a new `EditCosts` struct, checking for whether the specified costs are valid. /// /// # Arguments /// * `mismatch_cost` - cost of a mismatch edit, which must be positive /// * `gap_cost` - cost of a gap, which must be positive /// * `start_gap_cost` - additional cost of starting a gap, for affine gap costs; this can /// be zero for linear gap costs /// * `transpose_cost` - cost of a transpose, which must be cheaper than doing the equivalent /// operation with mismatches and gaps pub fn new(mismatch_cost: u8, gap_cost: u8, start_gap_cost: u8, transpose_cost: Option<u8>) -> Self { assert!(mismatch_cost > 0); assert!(gap_cost > 0); if let Some(cost) = transpose_cost { assert!(cost > 0); // transpose cost must be cheaper than doing the equivalent with other edits assert!((cost >> 1) < mismatch_cost); assert!((cost >> 1) < gap_cost); } Self{ mismatch_cost: mismatch_cost, gap_cost: gap_cost, start_gap_cost: start_gap_cost, transpose_cost: transpose_cost } } /// For Levenshtein searches, the cost of transpositions must be less than or equal to the cost of /// gaps. /// /// This is important for free gaps at the beginning of the needle to be unable to take priority /// over transpositions, as it is possible to emulate a transposition with two gaps. fn check_search(&self) { if let Some(cost) = self.transpose_cost { assert!(cost <= self.start_gap_cost + self.gap_cost); } } } /// Costs for Levenshtein distance, where mismatches and gaps both have a cost of 1, and /// transpositions are not allowed. pub const LEVENSHTEIN_COSTS: EditCosts = EditCosts{mismatch_cost: 1, gap_cost: 1, start_gap_cost: 0, transpose_cost: None}; /// Costs for restricted Damerau-Levenshtein distance, where mismatches, gaps, and transpositions /// all have a cost of 1. pub const RDAMERAU_COSTS: EditCosts = EditCosts{mismatch_cost: 1, gap_cost: 1, start_gap_cost: 0, transpose_cost: Some(1)}; /// Returns the Levenshtein distance between two strings using the naive scalar algorithm. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let dist = levenshtein_naive(b"abc", b"ab"); /// /// assert!(dist == 1); /// ``` pub fn levenshtein_naive<T: PartialEq>(a: &[T], b: &[T]) -> u32 { levenshtein_naive_with_opts(a, b, false, LEVENSHTEIN_COSTS).0 } /// Returns the Levenshtein distance between two strings using the naive scalar algorithm. /// /// # Arguments /// * `a` - first string (&str) /// * `b` - second string (&str) /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let dist = levenstein_naive_str("abc", "ab"); /// /// assert!(dist == 1); /// ``` pub fn levenstein_naive_str(a: &str, b: &str) -> u32 { let a: Vec<char> = a.chars().collect(); let b: Vec<char> = b.chars().collect(); levenshtein_naive(&a, &b) } /// Returns the Levenshtein distance between two strings and optionally, the edit traceback, /// using the naive scalar algorithm, with extra options.
/// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// * `trace_on` - whether to return the traceback, the sequence of edits between `a` and `b` /// * `costs` - `EditCosts` struct for the cost of each edit operation /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let dist = levenshtein_naive_with_opts(b"abc", b"ab", true, LEVENSHTEIN_COSTS); /// /// assert!(dist == (1, Some(vec![Edit{edit: EditType::Match, count: 2}, /// Edit{edit: EditType::BGap, count: 1}]))); /// ``` #[inline] pub fn levenshtein_naive_with_opts(a: &[T], b: &[T], trace_on: bool, costs: EditCosts) -> (u32, Option>) where T: PartialEq { let swap = a.len() > b.len(); // swap so that a len <= b len let a_new = if swap {b} else {a}; let a_new_len = a_new.len(); let b_new = if swap {a} else {b}; let b_new_len = b_new.len(); let mismatch_cost = costs.mismatch_cost as u32; let gap_cost = costs.gap_cost as u32; let start_gap_cost = costs.start_gap_cost as u32; let transpose_cost = match costs.transpose_cost { Some(cost) => cost as u32, None => 0 }; let allow_transpose = costs.transpose_cost.is_some(); let len = a_new_len + 1; let mut dp0 = vec![0u32; len]; let mut dp1 = vec![0u32; len]; // in each iteration, dp0 and dp1 are already calculated let mut dp2 = vec![0u32; len]; // dp2 the currently calculated column let mut a_gap_dp = vec![u32::MAX; len]; let mut b_gap_dp = vec![u32::MAX; len]; let mut traceback = if trace_on {vec![0u8; (b_new_len + 1) * len]} else {vec![]}; for i in 0..len { dp1[i] = (i as u32) * gap_cost + if i == 0 {0} else {start_gap_cost}; if trace_on { traceback[0 * len + i] = 2u8; } } for i in 1..(b_new_len + 1) { a_gap_dp[0] = (i as u32) * gap_cost + start_gap_cost; dp2[0] = (i as u32) * gap_cost + start_gap_cost; if trace_on { traceback[i * len + 0] = 1u8; } for j in 1..len { let sub = dp1[j - 1] + ((a_new[j - 1] != b_new[i - 1]) as u32) * mismatch_cost; a_gap_dp[j] = cmp::min(dp1[j] + start_gap_cost + gap_cost, a_gap_dp[j].saturating_add(gap_cost)); b_gap_dp[j] = cmp::min(dp2[j - 1] + start_gap_cost + gap_cost, b_gap_dp[j - 1].saturating_add(gap_cost)); let traceback_idx = i * len + j; dp2[j] = a_gap_dp[j]; if trace_on { traceback[traceback_idx] = 1u8; } if b_gap_dp[j] < dp2[j] { dp2[j] = b_gap_dp[j]; if trace_on { traceback[traceback_idx] = 2u8; } } if sub <= dp2[j] { dp2[j] = sub; if trace_on { traceback[traceback_idx] = 0u8; } } if allow_transpose && i > 1 && j > 1 && a_new[j - 1] == b_new[i - 2] && a_new[j - 2] == b_new[i - 1] { let transpose = dp0[j - 2] + transpose_cost; if transpose <= dp2[j] { dp2[j] = transpose; if trace_on { traceback[traceback_idx] = 3u8; } } } } mem::swap(&mut dp0, &mut dp1); mem::swap(&mut dp1, &mut dp2); } if trace_on { // estimate an upper bound for the number of Edits let mut upper_bound_edits = dp1[a_new_len] / cmp::min(mismatch_cost, gap_cost); if allow_transpose { upper_bound_edits = cmp::max(upper_bound_edits, (dp1[a_new_len] >> 1) / transpose_cost + 1); } let mut res: Vec = Vec::with_capacity(((upper_bound_edits << 1) + 1) as usize); let mut i = b_new_len; let mut j = a_new_len; while i > 0 || j > 0 { let edit = traceback[i * len + j]; let e = match edit { 0 => { i -= 1; j -= 1; if a_new[j] == b_new[i] {EditType::Match} else {EditType::Mismatch} }, 1 => { i -= 1; if swap {EditType::BGap} else {EditType::AGap} }, 2 => { j -= 1; if swap {EditType::AGap} else {EditType::BGap} }, 3 => { i -= 2; j -= 2; EditType::Transpose }, _ => unreachable!() }; if res.len() > 0 && res.last().unwrap().edit 
== e { res.last_mut().unwrap().count += 1; }else{ res.push(Edit{edit: e, count: 1}); } } res.reverse(); (dp1[a_new_len], Some(res)) }else{ (dp1[a_new_len], None) } } /// Returns the Levenshtein distance, bounded by a cost threshold `k`, between two strings, using the /// naive scalar algorithm. /// /// This will return `None` if the Levenshtein distance between `a` and `b` is greater than the /// threshold `k`. /// This should be much faster than `levenshtein_naive` if `k` is small compared to the lengths of /// `a` and `b`. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// * `k` - maximum number of edits allowed between `a` and `b` /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let dist = levenshtein_naive_k(b"abc", b"ab", 1); /// /// assert!(dist.unwrap() == 1); /// ``` pub fn levenshtein_naive_k(a: &[u8], b: &[u8], k: u32) -> Option { let res = levenshtein_naive_k_with_opts(a, b, k, false, LEVENSHTEIN_COSTS); match res { Some((edits, _)) => Some(edits), None => None } } /// Returns the Levenshtein distance, bounded by a cost threshold `k`, between two strings and optionally, /// the edit traceback, using the naive scalar algorithm, with extra options. /// /// This will return `None` if the Levenshtein distance between `a` and `b` is greater than the /// threshold `k`. /// This should be much faster than `levenshtein_naive_with_opts` if `k` is small compared to the lengths of /// `a` and `b`. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// * `k` - maximum number of cost allowed between `a` and `b` /// * `trace_on` - whether to return the traceback, the sequence of edits between `a` and `b` /// * `costs` - `EditCosts` struct for the cost of each edit operation /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let dist = levenshtein_naive_k_with_opts(b"abc", b"ab", 1, true, LEVENSHTEIN_COSTS); /// /// assert!(dist.unwrap() == (1, Some(vec![Edit{edit: EditType::Match, count: 2}, /// Edit{edit: EditType::BGap, count: 1}]))); /// ``` #[inline] pub fn levenshtein_naive_k_with_opts(a: &[T], b: &[T], k: u32, trace_on: bool, costs: EditCosts) -> Option<(u32, Option>)> where T: PartialEq { let swap = a.len() > b.len(); // swap so that a len <= b len let a_new = if swap {b} else {a}; let a_new_len = a_new.len(); let b_new = if swap {a} else {b}; let b_new_len = b_new.len(); let mismatch_cost = costs.mismatch_cost as u32; let gap_cost = costs.gap_cost as u32; let start_gap_cost = costs.start_gap_cost as u32; let transpose_cost = match costs.transpose_cost { Some(cost) => cost as u32, None => 0 }; let allow_transpose = costs.transpose_cost.is_some(); // upper bound on the number of edits, in case k is too large let max_k = cmp::min((a_new_len as u32) * mismatch_cost, ((a_new_len as u32) << 1) * gap_cost + if a_new_len == 0 {0} else {start_gap_cost + if b_new_len == a_new_len {start_gap_cost} else {0}}); let max_k = cmp::min(k, max_k + ((b_new_len - a_new_len) as u32) * gap_cost + if b_new_len == a_new_len {0} else {start_gap_cost}); // farthest we can stray from the main diagonal // have to start at least one gap let unit_k = (max_k.saturating_sub(start_gap_cost) / gap_cost) as usize; if b_new_len - a_new_len > unit_k { return None; } let len = a_new_len + 1; let mut lo = 0usize; let mut hi = cmp::min(unit_k + 1, b_new_len + 1); let mut prev_lo0; let mut prev_lo1 = 0; // unused value let mut prev_hi; let k_len = 
cmp::min((unit_k << 1) + 1, b_new_len + 1); let mut dp0 = vec![0u32; k_len]; let mut dp1 = vec![0u32; k_len]; // in each iteration, dp0 and dp1 are already calculated let mut dp2 = vec![0u32; k_len]; // dp2 the currently calculated row let mut a_gap_dp = vec![u32::MAX; k_len]; let mut b_gap_dp = vec![u32::MAX; k_len]; let mut traceback = if trace_on {vec![0u8; len * k_len]} else {vec![]}; for i in 0..(hi - lo) { dp1[i] = (i as u32) * gap_cost + if i == 0 {0} else {start_gap_cost}; if trace_on { traceback[0 * k_len + i] = 1u8; } } for i in 1..len { // keep track of prev_lo for the offset of previous rows prev_lo0 = prev_lo1; prev_lo1 = lo; prev_hi = hi; hi = cmp::min(hi + 1, b_new_len + 1); if i > unit_k { lo += 1; } for j in 0..(hi - lo) { let idx = lo + j; let sub = if idx == 0 { u32::MAX }else{ dp1[idx - 1 - prev_lo1] + ((a_new[i - 1] != b_new[idx - 1]) as u32) * mismatch_cost }; a_gap_dp[j] = if j == 0 { u32::MAX }else{ cmp::min(dp2[j - 1] + start_gap_cost + gap_cost, a_gap_dp[j - 1].saturating_add(gap_cost)) }; b_gap_dp[j] = if idx >= prev_hi { u32::MAX }else{ cmp::min(dp1[idx - prev_lo1] + start_gap_cost + gap_cost, b_gap_dp[idx - prev_lo1].saturating_add(gap_cost)) }; dp2[j] = sub; let traceback_idx = i * k_len + j; if trace_on { traceback[traceback_idx] = 0u8; } if a_gap_dp[j] < dp2[j] { dp2[j] = a_gap_dp[j]; if trace_on { traceback[traceback_idx] = 1u8; } } if b_gap_dp[j] < dp2[j] { dp2[j] = b_gap_dp[j]; if trace_on { traceback[traceback_idx] = 2u8; } } if allow_transpose && i > 1 && idx > 1 && a_new[i - 1] == b_new[idx - 2] && a_new[i - 2] == b_new[idx - 1] { let transpose = dp0[idx - prev_lo0 - 2] + transpose_cost; if transpose <= dp2[j] { dp2[j] = transpose; if trace_on { traceback[traceback_idx] = 3u8; } } } } mem::swap(&mut dp0, &mut dp1); mem::swap(&mut dp1, &mut dp2); } if dp1[hi - lo - 1] > max_k { return None; } if !trace_on { return Some((dp1[hi - lo - 1], None)); } // estimate an upper bound for the number of Edits let mut upper_bound_edits = dp1[hi - lo - 1] / cmp::min(mismatch_cost, gap_cost); if allow_transpose { upper_bound_edits = cmp::max(upper_bound_edits, (dp1[hi - lo - 1] >> 1) / transpose_cost + 1); } let mut res: Vec = Vec::with_capacity(((upper_bound_edits << 1) + 1) as usize); let mut i = a_new_len; let mut j = b_new_len; while i > 0 || j > 0 { let edit = traceback[i * k_len + (j - (if i > unit_k {i - unit_k} else {0}))]; let e = match edit { 0 => { i -= 1; j -= 1; if a_new[i] == b_new[j] {EditType::Match} else {EditType::Mismatch} }, 1 => { j -= 1; if swap {EditType::BGap} else {EditType::AGap} }, 2 => { i -= 1; if swap {EditType::AGap} else {EditType::BGap} }, 3 => { i -= 2; j -= 2; EditType::Transpose }, _ => unreachable!() }; if res.len() > 0 && res.last().unwrap().edit == e { res.last_mut().unwrap().count += 1; }else{ res.push(Edit{edit: e, count: 1}); } } res.reverse(); Some((dp1[hi - lo - 1], Some(res))) } fn translate_str(chars: &mut Vec, s: &str) -> Option> { s.chars().map(|c| match chars.iter().position(|&d| c == d) { Some(i) => Some(i as u8), None => { let idx = chars.len(); if idx < 256 { chars.push(c); Some(idx as u8) } else { None } } }).collect() } /// Returns the Levenshtein distance, bounded by a cost threshold `k`, between two utf8 encoded strings, using /// SIMD acceleration. 
/// # Arguments /// * `a` - first string (&str) /// * `b` - second string (&str) /// * `k` - maximum number of edits allowed between `a` and `b` /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let dist = levenshtein_simd_k_str("abc", "ab", 1); /// /// assert!(dist.unwrap() == 1); /// ``` pub fn levenshtein_simd_k_str(a: &str, b: &str, k: u32) -> Option<u32> { if a.is_ascii() && b.is_ascii() { levenshtein_simd_k(a.as_bytes(), b.as_bytes(), k) } else { let mut chars = Vec::with_capacity(256); let a = translate_str(&mut chars, a)?; let b = translate_str(&mut chars, b)?; levenshtein_simd_k(&a, &b, k) } } /// Returns the Levenshtein distance, bounded by a cost threshold `k`, between two strings, using /// SIMD acceleration. /// /// This will return `None` if the Levenshtein distance between `a` and `b` is greater than the /// threshold `k`. /// This should be much faster than `levenshtein_naive` and `levenshtein_naive_k`. /// Internally, this will automatically use AVX or SSE vectors with 8-bit, 16-bit, or 32-bit elements /// to represent anti-diagonals in the dynamic programming matrix for calculating Levenshtein distance. /// If AVX2 or SSE4.1 is not supported, then this will automatically fall back to /// `levenshtein_naive_k_with_opts`. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// * `k` - maximum number of edits allowed between `a` and `b` /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let dist = levenshtein_simd_k(b"abc", b"ab", 1); /// /// assert!(dist.unwrap() == 1); /// ``` pub fn levenshtein_simd_k(a: &[u8], b: &[u8], k: u32) -> Option<u32> { let res = levenshtein_simd_k_with_opts(a, b, k, false, LEVENSHTEIN_COSTS); match res { Some((edits, _)) => Some(edits), None => None } } /// Returns the Levenshtein distance, bounded by a cost threshold `k`, between two strings and optionally, /// the edit traceback, using SIMD acceleration, with extra options. /// /// This will return `None` if the Levenshtein distance between `a` and `b` is greater than the /// threshold `k`. /// This should be much faster than `levenshtein_naive_with_opts` and /// `levenshtein_naive_k_with_opts`. /// Internally, this will automatically use AVX or SSE vectors with 8-bit, 16-bit, or 32-bit elements /// to represent anti-diagonals in the dynamic programming matrix for calculating Levenshtein distance. /// If AVX2 or SSE4.1 is not supported, then this will automatically fall back to /// `levenshtein_naive_k_with_opts`.
/// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// * `k` - maximum number of cost allowed between `a` and `b` /// * `trace_on` - whether to return the traceback, the sequence of edits between `a` and `b` /// * `costs` - `EditCosts` struct for the cost of each edit operation /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let dist = levenshtein_simd_k_with_opts(b"abc", b"ab", 1, true, LEVENSHTEIN_COSTS); /// /// assert!(dist.unwrap() == (1, Some(vec![Edit{edit: EditType::Match, count: 2}, /// Edit{edit: EditType::BGap, count: 1}]))); /// ``` pub fn levenshtein_simd_k_with_opts(a: &[u8], b: &[u8], k: u32, trace_on: bool, costs: EditCosts) -> Option<(u32, Option>)> { if a.len() == 0 && b.len() == 0 { return if trace_on {Some((0u32, Some(vec![])))} else {Some((0u32, None))}; } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { let min_len = cmp::min(a.len(), b.len()) as u32; let max_len = cmp::max(a.len(), b.len()) as u32; // upper bound on the number of edits, in case k is too large let max_k = cmp::min(min_len * (costs.mismatch_cost as u32), (min_len << 1) * (costs.gap_cost as u32) + if min_len == 0 {0} else {costs.start_gap_cost as u32 + if max_len == min_len {costs.start_gap_cost as u32} else {0}}); let max_k = cmp::min(k, max_k + (max_len - min_len) * (costs.gap_cost as u32) + if max_len == min_len {0} else {costs.start_gap_cost as u32}); // farthest we can stray from the main diagonal // have to start at least one gap let unit_k = cmp::min(max_k.saturating_sub(costs.start_gap_cost as u32) / (costs.gap_cost as u32), max_len); // note: do not use the MAX value, because it indicates overflow/inaccuracy if cfg!(feature = "jewel-avx") && is_x86_feature_detected!("avx2") { if cfg!(feature = "jewel-8bit") && unit_k <= (Avx1x32x8::static_upper_bound() as u32 - 2) && max_k <= ((u8::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_avx_1x32x8(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-8bit") && unit_k <= (Avx2x32x8::static_upper_bound() as u32 - 2) && max_k <= ((u8::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_avx_2x32x8(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-8bit") && unit_k <= (Avx4x32x8::static_upper_bound() as u32 - 2) && max_k <= ((u8::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_avx_4x32x8(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-8bit") && unit_k <= (Avx8x32x8::static_upper_bound() as u32 - 2) && max_k <= ((u8::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_avx_8x32x8(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-16bit") && max_k <= ((u16::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_avx_nx16x16(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-32bit") { return unsafe {levenshtein_simd_core_avx_nx8x32(a, b, max_k, trace_on, costs)}; } }else if cfg!(feature = "jewel-sse") && is_x86_feature_detected!("sse4.1") { if cfg!(feature = "jewel-8bit") && unit_k <= (Sse1x16x8::static_upper_bound() as u32 - 2) && max_k <= ((u8::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_sse_1x16x8(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-8bit") && unit_k <= (Sse2x16x8::static_upper_bound() as u32 - 2) && max_k <= ((u8::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_sse_2x16x8(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-8bit") && unit_k <= (Sse4x16x8::static_upper_bound() as u32 - 2) && max_k <= ((u8::MAX - 1) as u32) { 
return unsafe {levenshtein_simd_core_sse_4x16x8(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-8bit") && unit_k <= (Sse8x16x8::static_upper_bound() as u32 - 2) && max_k <= ((u8::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_sse_8x16x8(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-8bit") && unit_k <= (Sse16x16x8::static_upper_bound() as u32 - 2) && max_k <= ((u8::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_sse_16x16x8(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-16bit") && max_k <= ((u16::MAX - 1) as u32) { return unsafe {levenshtein_simd_core_sse_nx8x16(a, b, max_k, trace_on, costs)}; }else if cfg!(feature = "jewel-32bit") { return unsafe {levenshtein_simd_core_sse_nx4x32(a, b, max_k, trace_on, costs)}; } } } levenshtein_naive_k_with_opts(a, b, k, trace_on, costs) } macro_rules! create_levenshtein_simd_core { ($name:ident, $traceback_name:ident, $jewel:ty, $target:literal) => { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = $target)] unsafe fn $name(a_old: &[u8], b_old: &[u8], k: u32, trace_on: bool, costs: EditCosts) -> Option<(u32, Option>)> { #[cfg(feature = "debug")] { println!("Debug: Levenshtein Jewel vector type {} for target {}.", stringify!($jewel), stringify!($target)); } // swap a and b so that a is shorter than b, if applicable // makes operations later on slightly easier, since length of a <= length of b let swap = a_old.len() > b_old.len(); let a = if swap {b_old} else {a_old}; let a_len = a.len(); let b = if swap {a_old} else {b_old}; let b_len = b.len(); let unit_k = cmp::min((k.saturating_sub(costs.start_gap_cost as u32) / (costs.gap_cost as u32)) as usize, b_len); if b_len - a_len > unit_k { return None; } // initialized with max values // must use saturated additions afterwards to not overflow let mut dp1 = <$jewel>::repeating_max((unit_k + 2) as usize); let max_len = dp1.upper_bound(); let mut dp2 = <$jewel>::repeating_max(max_len); let mut dp0 = <$jewel>::repeating_max(max_len); let mut dp_temp = <$jewel>::repeating_max(max_len); // dp0 -> dp_temp -> dp1 -> dp2 -> current diagonal // dp for whether to extend gap or start new gap let mut a_gap_dp = <$jewel>::repeating_max(max_len); let mut b_gap_dp = <$jewel>::repeating_max(max_len); // lengths of the (anti) diagonals // assumes max_len is even let k1 = max_len - 1; let k1_div2 = k1 >> 1; let k2 = max_len - 2; let k2_div2 = k2 >> 1; // set dp[0][0] = 0 dp1.slow_insert(k1_div2, 0); // set dp[0][1] = start_gap_cost + gap_cost and dp[1][0] = start_gap_cost + gap_cost dp2.slow_insert(k2_div2 - 1, costs.start_gap_cost as u32 + costs.gap_cost as u32); dp2.slow_insert(k2_div2, costs.start_gap_cost as u32 + costs.gap_cost as u32); b_gap_dp.slow_insert(k2_div2 - 1, costs.start_gap_cost as u32 + costs.gap_cost as u32); a_gap_dp.slow_insert(k2_div2, costs.start_gap_cost as u32 + costs.gap_cost as u32); // a_k1_window and a_k2_window represent reversed portions of the string a // copy in half of k1/k2 number of characters // these characters are placed in the second half of b windows // since a windows are reversed, the characters are placed in reverse in the first half of b windows let mut a_k1_window = <$jewel>::repeating(0, max_len); a_k1_window.slow_loadu(k1_div2 - 1, a.as_ptr(), cmp::min(k1_div2, a_len), true); let mut b_k1_window = <$jewel>::repeating(0, max_len); b_k1_window.slow_loadu(k1_div2 + 1, b.as_ptr(), cmp::min(k1_div2, b_len), false); let mut a_k2_window = <$jewel>::repeating(0, max_len); 
a_k2_window.slow_loadu(k2_div2 - 1, a.as_ptr(), cmp::min(k2_div2, a_len), true); let mut b_k2_window = <$jewel>::repeating(0, max_len); b_k2_window.slow_loadu(k2_div2, b.as_ptr(), cmp::min(k2_div2, b_len), false); // used to keep track of the next characters to place in the windows let mut k1_idx = k1_div2 - 1; let mut k2_idx = k2_div2 - 1; let len_diff = b_len - a_len; let len = a_len + b_len + 1; let len_div2 = (len >> 1) + (len & 1); let ends_with_k2 = len & 1 == 0; // every diff between the length of a and b results in a shift from the main diagonal let final_idx = { if ends_with_k2 { // divisible by 2, ends with k2 k2_div2 + ((len_diff - 1) >> 1) }else{ // not divisible by 2, ends with k1 k1_div2 + (len_diff >> 1) } }; // 0 = match/mismatch, 1 = a gap, 2 = b gap, 3 = transpose let mut traceback_arr = if trace_on {Vec::with_capacity(len + (len & 1))} else {vec![]}; if trace_on { traceback_arr.push(<$jewel>::repeating(0, max_len)); traceback_arr.push(<$jewel>::repeating(0, max_len)); traceback_arr.get_unchecked_mut(1).slow_insert(k2_div2 - 1, 2); traceback_arr.get_unchecked_mut(1).slow_insert(k2_div2, 1); } // reusable constant let threes = <$jewel>::repeating(3, max_len); // used in calculations let mut sub = <$jewel>::repeating(0, max_len); let mut match_mask0 = <$jewel>::repeating(0, max_len); let mut match_mask1 = <$jewel>::repeating(0, max_len); let mut a_gap = <$jewel>::repeating(0, max_len); let mut b_gap = <$jewel>::repeating(0, max_len); let mut transpose = <$jewel>::repeating(0, max_len); let mismatch_cost = <$jewel>::repeating(costs.mismatch_cost as u32, max_len); let gap_cost = <$jewel>::repeating(costs.gap_cost as u32, max_len); let start_gap_cost = <$jewel>::repeating(costs.start_gap_cost as u32 + costs.gap_cost as u32, max_len); let transpose_cost = match costs.transpose_cost { Some(cost) => <$jewel>::repeating(cost as u32, max_len), None => <$jewel>::repeating(0, max_len) // value does not matter }; let allow_transpose = costs.transpose_cost.is_some(); // example: allow k = 2 edits for two strings of length 3 // -b-- // | xx */* // a x /*/* // | */*/ x // | */* xx // // each (anti) diagonal is represented with '*' or '/' // '/', use k2 = 2 // '*', use k1 = 3 // 'x' represents cells not in the "traditional" dp array // these out of bounds dp cells are shown because they represent // a horizontal sliding window of length 5 (2 * k + 1) // // dp2 is one diagonal before current // dp1 is two diagonals before current // dp0 is four diagonals before current // dp0 is useful for transpositions // we are trying to calculate the "current" diagonal // note that a k1 '*' dp diagonal has its center cell on the main diagonal // in general, the diagonals are centered on the main diagonal // each diagonal is represented using a Jewel vector // each vector goes from bottom-left to top-right // // the a windows and b windows are queues of a fixed length // a is reversed, so that elementwise comparison can be done between a and b // this operation obtains the comparison of characters along the (anti) diagonal // if transpositions are allowed, then previous match_masks must be saved to calculate // a[i - 1] == b[j] and a[i] == b[j - 1] // for speed, transpositions are done by directly blending using the mask, without calculating // the minimum cost compared to the other edit operations // // example of moving the windows: // a windows: [5 4 3 2 1] -> [6 5 4 3 2] (right shift + insert) // b windows: [1 2 3 4 5] -> [2 3 4 5 6] (left shift + insert) // // initially: // a windows: [2 1 0 0 0] 
// b windows: [0 0 0 1 2] // // note that there will be left over cells not filled in the Jewel vector // this is because k1 and k2 are not long enough // all of these empty cells should be at the end of the SIMD vectors // // each iteration of the loop below results in processing both a k1 diagonal and a k2 diagonal // this could be done with an alternating state flag but it is unrolled for less branching // // note: in traditional dp array // dp[i][j] -> dp[i + 1][j] is a gap in string b // dp[i][j] -> dp[i][j + 1] is a gap in string a for _ in 1..len_div2 { // move indexes in strings forward k1_idx += 1; k2_idx += 1; // move windows for the strings a and b a_k1_window.shift_right_1_mut(); if k1_idx < a_len { a_k1_window.insert_first(*a.get_unchecked(k1_idx) as u32); } b_k1_window.shift_left_1_mut(); if k1_idx < b_len { b_k1_window.insert_last_1(*b.get_unchecked(k1_idx) as u32); // k1 - 1 } a_k2_window.shift_right_1_mut(); if k2_idx < a_len { a_k2_window.insert_first(*a.get_unchecked(k2_idx) as u32); } b_k2_window.shift_left_1_mut(); if k2_idx < b_len { b_k2_window.insert_last_2(*b.get_unchecked(k2_idx) as u32); // k2 - 1 } // (anti) diagonal that matches in the a and b windows <$jewel>::cmpeq(&a_k1_window, &b_k1_window, &mut match_mask1); <$jewel>::andnot(&match_mask1, &mismatch_cost, &mut sub); sub.adds_mut(&dp1); // cost of gaps in a // start new gap <$jewel>::adds(&dp2, &start_gap_cost, &mut a_gap); // continue gap a_gap_dp.adds_mut(&gap_cost); a_gap_dp.min_mut(&a_gap); a_gap_dp.shift_right_1_mut(); a_gap_dp.insert_first_max(); // cost of gaps in b // start new gap <$jewel>::adds(&dp2, &start_gap_cost, &mut b_gap); // continue gap b_gap_dp.adds_mut(&gap_cost); b_gap_dp.min_mut(&b_gap); if allow_transpose { <$jewel>::shift_right_1(&match_mask0, &mut transpose); // reuse transpose, zeros shifted in transpose.and_mut(&match_mask0); // make sure that current matching locations are excluded <$jewel>::andnot(&match_mask1, &transpose, &mut match_mask0); // reuse match_mask0 to represent transpose mask <$jewel>::adds(&dp0, &transpose_cost, &mut transpose); } // min of the cost of all three edit operations if trace_on { let mut args = <$jewel>::triple_argmin(&sub, &a_gap_dp, &b_gap_dp, &mut dp0); if allow_transpose { // blend using transpose mask dp0.blendv_mut(&transpose, &match_mask0); args.blendv_mut(&threes, &match_mask0); mem::swap(&mut match_mask0, &mut match_mask1); } traceback_arr.push(args); }else{ <$jewel>::min(&a_gap_dp, &b_gap_dp, &mut dp0); dp0.min_mut(&sub); if allow_transpose { // blend using transpose mask dp0.blendv_mut(&transpose, &match_mask0); mem::swap(&mut match_mask0, &mut match_mask1); } } mem::swap(&mut dp0, &mut dp_temp); mem::swap(&mut dp_temp, &mut dp1); mem::swap(&mut dp1, &mut dp2); // (anti) diagonal that matches in the a and b windows <$jewel>::cmpeq(&a_k2_window, &b_k2_window, &mut match_mask1); <$jewel>::andnot(&match_mask1, &mismatch_cost, &mut sub); sub.adds_mut(&dp1); // cost of gaps in b // start new gap <$jewel>::adds(&dp2, &start_gap_cost, &mut b_gap); // continue gap b_gap_dp.adds_mut(&gap_cost); b_gap_dp.min_mut(&b_gap); b_gap_dp.shift_left_1_mut(); b_gap_dp.insert_last_max(); // k1, shift in max value // cost of gaps in a // start new gap <$jewel>::adds(&dp2, &start_gap_cost, &mut a_gap); a_gap_dp.adds_mut(&gap_cost); // continue gap a_gap_dp.min_mut(&a_gap); if allow_transpose { <$jewel>::shift_left_1(&match_mask0, &mut transpose); // reuse transpose, zeros shifted in transpose.and_mut(&match_mask0); // make sure that current matching locations are 
excluded <$jewel>::andnot(&match_mask1, &transpose, &mut match_mask0); // reuse match_mask0 to represent transpose mask <$jewel>::adds(&dp0, &transpose_cost, &mut transpose); } // min of the cost of all three edit operations if trace_on { let mut args = <$jewel>::triple_argmin(&sub, &a_gap_dp, &b_gap_dp, &mut dp0); if allow_transpose { // blend using transpose mask dp0.blendv_mut(&transpose, &match_mask0); args.blendv_mut(&threes, &match_mask0); mem::swap(&mut match_mask0, &mut match_mask1); } traceback_arr.push(args); }else{ <$jewel>::min(&a_gap_dp, &b_gap_dp, &mut dp0); dp0.min_mut(&sub); if allow_transpose { // blend using transpose mask dp0.blendv_mut(&transpose, &match_mask0); mem::swap(&mut match_mask0, &mut match_mask1); } } mem::swap(&mut dp0, &mut dp_temp); mem::swap(&mut dp_temp, &mut dp1); mem::swap(&mut dp1, &mut dp2); } let final_res = if ends_with_k2 { dp2.slow_extract(final_idx) }else{ dp1.slow_extract(final_idx) }; if final_res > k { return None; } if !trace_on { return Some((final_res, None)); } // upper bound the number of edit operations, to reduce memory allocations for saving the traceback let mut upper_bound_edits = final_res / (cmp::min(costs.mismatch_cost, costs.gap_cost) as u32); if let Some(cost) = costs.transpose_cost { upper_bound_edits = cmp::max(upper_bound_edits, (final_res >> 1) / (cost as u32) + 1); } Some((final_res, Some($traceback_name(&traceback_arr, upper_bound_edits as usize, final_idx, a, b, swap, ends_with_k2)))) } unsafe fn $traceback_name(arr: &[$jewel], k: usize, mut idx: usize, a: &[u8], b: &[u8], swap: bool, mut is_k2: bool) -> Vec { // keep track of position in traditional dp array and strings let mut i = a.len(); // index in a let mut j = b.len(); // index in b // last diagonal may overshoot, so ignore it let mut arr_idx = arr.len() - 1 - (if is_k2 {0} else {1}); let mut res: Vec = Vec::with_capacity((k << 1) + 1); while arr_idx > 0 { // each Jewel vector in arr is only visited once, so extract (which is costly) is fine let edit = arr.get_unchecked(arr_idx).slow_extract(idx); let e = match edit { 0 => { // match/mismatch arr_idx -= 2; i -= 1; j -= 1; if *a.get_unchecked(i) == *b.get_unchecked(j) {EditType::Match} else {EditType::Mismatch} }, 1 => { // a gap arr_idx -= 1; if !is_k2 { idx -= 1; } j -= 1; is_k2 = !is_k2; // must account for alternating k1/k2 diagonals if swap {EditType::BGap} else {EditType::AGap} // account for the swap in the beginning }, 2 => { // b gap arr_idx -= 1; if is_k2 { idx += 1; } i -= 1; is_k2 = !is_k2; if swap {EditType::AGap} else {EditType::BGap} }, 3 => { // transpose arr_idx -= 4; i -= 2; j -= 2; EditType::Transpose }, _ => unreachable!() }; if res.len() > 0 && res.last().unwrap().edit == e { res.last_mut().unwrap().count += 1; }else{ res.push(Edit{edit: e, count: 1}); } } res.reverse(); res } }; } // create a version of the functions for each Jewel vector #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_avx_1x32x8, traceback_avx_1x32x8, Avx1x32x8, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_avx_2x32x8, traceback_avx_2x32x8, Avx2x32x8, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_avx_4x32x8, traceback_avx_4x32x8, Avx4x32x8, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_avx_8x32x8, traceback_avx_8x32x8, Avx8x32x8, "avx2"); 
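// The macro above stamps out one `levenshtein_simd_core_*`/`traceback_*` pair per Jewel vector
// type, and `levenshtein_simd_k_with_opts` dispatches to whichever fits the edit threshold.
// The module below is an illustrative sketch, not part of the original crate: a small test that
// exercises the expected invariant that the SIMD path and the naive scalar path compute the same
// distance and both respect the threshold `k`. The strings and the custom `EditCosts` values used
// here are arbitrary assumptions chosen only for the example.
#[cfg(test)]
mod simd_vs_naive_sketch {
    use super::*;

    #[test]
    fn simd_agrees_with_naive() {
        // plain Levenshtein costs: mismatches and gaps cost 1, no transpositions
        let a: &[u8] = b"kitten";
        let b: &[u8] = b"sitting";
        let simd = levenshtein_simd_k_with_opts(a, b, 5, false, LEVENSHTEIN_COSTS).unwrap().0;
        let (naive, _) = levenshtein_naive_k_with_opts(a, b, 5, false, LEVENSHTEIN_COSTS).unwrap();
        assert_eq!(simd, naive);
        assert_eq!(simd, 3);

        // custom costs: mismatch = 2, gap = 1, starting a gap costs an extra 1,
        // and a transposition costs 1 (cheaper than the equivalent mismatch/gap edits)
        let costs = EditCosts::new(2, 1, 1, Some(1));
        let x: &[u8] = b"abcd";
        let y: &[u8] = b"acbd";
        let simd = levenshtein_simd_k_with_opts(x, y, 4, false, costs).unwrap().0;
        let (naive, _) = levenshtein_naive_k_with_opts(x, y, 4, false, costs).unwrap();
        assert_eq!(simd, naive);

        // when the true cost exceeds the threshold `k`, both paths return None
        let p: &[u8] = b"aaaa";
        let q: &[u8] = b"bbbb";
        assert!(levenshtein_simd_k_with_opts(p, q, 1, false, LEVENSHTEIN_COSTS).is_none());
        assert!(levenshtein_naive_k_with_opts(p, q, 1, false, LEVENSHTEIN_COSTS).is_none());
    }
}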
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_avx_nx16x16, traceback_avx_nx16x16, AvxNx16x16, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_avx_nx8x32, traceback_avx_nx8x32, AvxNx8x32, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_sse_1x16x8, traceback_sse_1x16x8, Sse1x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_sse_2x16x8, traceback_sse_2x16x8, Sse2x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_sse_4x16x8, traceback_sse_4x16x8, Sse4x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_sse_8x16x8, traceback_sse_8x16x8, Sse8x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_sse_16x16x8, traceback_sse_16x16x8, Sse16x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_sse_nx8x16, traceback_sse_nx8x16, SseNx8x16, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_simd_core!(levenshtein_simd_core_sse_nx4x32, traceback_sse_nx4x32, SseNx4x32, "sse4.1"); /// Returns the Levenshtein distance between two strings using SIMD acceleration. /// /// Note that `levenshtein_exp` may be much faster if the number of edits between the two strings /// is expected to be small. /// Internally, this will call `levenshtein_simd_k`. /// If AVX2 or SSE4.1 is not supported, then this will automatically fall back to a scalar alternative. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Example /// ``` /// # use triple_accel::*; /// let dist = levenshtein(b"abc", b"ab"); /// /// assert!(dist == 1); /// ``` pub fn levenshtein(a: &[u8], b: &[u8]) -> u32 { levenshtein_simd_k(a, b, u32::MAX).unwrap() } /// Returns the restricted Damerau-Levenshtein distance between two strings using SIMD acceleration. /// /// Note that `rdamerau_exp` may be much faster if the number of edits between the two strings /// is expected to be small. /// Internally, this will call `levenshtein_simd_k_with_opts`. /// If AVX2 or SSE4.1 is not supported, then this will automatically fall back to a scalar alternative. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Example /// ``` /// # use triple_accel::*; /// let dist = rdamerau(b"abc", b"acb"); /// /// assert!(dist == 1); /// ``` pub fn rdamerau(a: &[u8], b: &[u8]) -> u32 { levenshtein_simd_k_with_opts(a, b, u32::MAX, false, RDAMERAU_COSTS).unwrap().0 } /// Returns the Levenshtein distance between two strings using exponential search and SIMD /// acceleration. /// /// This may be much more efficient than `levenshtein` if the number of edits between `a` and `b` /// is expected to be small. /// Internally, this will call `levenshtein_simd_k` with values of `k` determined through /// exponential search. /// If AVX2 or SSE4.1 is not supported, then this will automatically fall back to a scalar alternative. 
/// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Example /// ``` /// # use triple_accel::*; /// let dist = levenshtein_exp(b"abc", b"ab"); /// /// assert!(dist == 1); /// ``` pub fn levenshtein_exp(a: &[u8], b: &[u8]) -> u32 { let mut k = 30; let mut res = levenshtein_simd_k(a, b, k); // exponential search while res.is_none() { k <<= 1; res = levenshtein_simd_k(a, b, k); } // should not panic res.unwrap() } /// Returns the restricted Damerau-Levenshtein distance between two strings using exponential /// search and SIMD acceleration. /// /// This may be much more efficient than `rdamerau` if the number of edits between `a` and `b` /// is expected to be small. /// Internally, this will call `levenshtein_simd_k_with_opts` with values of `k` determined through /// exponential search. /// If AVX2 or SSE4.1 is not supported, then this will automatically fall back to a scalar alternative. /// /// # Arguments /// * `a` - first string (slice) /// * `b` - second string (slice) /// /// # Example /// ``` /// # use triple_accel::*; /// let dist = rdamerau_exp(b"abc", b"acb"); /// /// assert!(dist == 1); /// ``` pub fn rdamerau_exp(a: &[u8], b: &[u8]) -> u32 { let mut k = 30; let mut res = levenshtein_simd_k_with_opts(a, b, k, false, RDAMERAU_COSTS); // exponential search while res.is_none() { k <<= 1; res = levenshtein_simd_k_with_opts(a, b, k, false, RDAMERAU_COSTS); } // should not panic res.unwrap().0 } /// Returns an iterator over the best `Match`s by searching through the text `haystack` for the /// pattern `needle` using the naive algorithm. /// /// The best matches are the matches with the lowest Levenshtein distance. /// If multiple best matches end at the same position or fully overlap, then the longest match is chosen. /// If `needle` is empty, then no `Match`es are returned. /// Each returned `Match` requires at least half or more bytes of the `needle` to match /// somewhere in the `haystack`. /// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let matches: Vec<Match> = levenshtein_search_naive(b"abc", b" abd").collect(); /// /// assert!(matches == vec![Match{start: 2, end: 5, k: 1}]); /// ``` pub fn levenshtein_search_naive<'a>(needle: &'a [u8], haystack: &'a [u8]) -> Box<dyn Iterator<Item = Match> + 'a> { levenshtein_search_naive_with_opts(needle, haystack, ((needle.len() as u32) >> 1) + ((needle.len() as u32) & 1), SearchType::Best, LEVENSHTEIN_COSTS, false) } /// Returns an iterator over `Match`s by searching through the text `haystack` for the /// pattern `needle` using the naive algorithm, with extra options. /// /// Note that overlapping matches may be returned. /// If multiple matches end at the same position, then the longest match is chosen. /// If `needle` is empty and `anchored` is false, then no `Match`es are returned.
/// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// * `k` - maximum cost threshold for a match to be returned /// * `search_type` - indicates whether to return all matches (within a cost of `k`), or the best matches with /// the lowest cost (additionally, only the longest matches are retained for matches that fully overlap) /// * `costs` - `EditCosts` struct for the cost of each edit operation /// * `anchored` - whether the `needle` should be anchored to the start of the `haystack` string, /// causing any shifts to cost gap edits /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let matches: Vec = levenshtein_search_naive_with_opts(b"abc", b" acb", 1, SearchType::All, RDAMERAU_COSTS, false).collect(); /// /// // note: it is possible to end the match at two different positions /// assert!(matches == vec![Match{start: 2, end: 4, k: 1}, Match{start: 2, end: 5, k: 1}]); /// ``` pub fn levenshtein_search_naive_with_opts<'a>(needle: &'a [u8], haystack: &'a [u8], k: u32, search_type: SearchType, costs: EditCosts, anchored: bool) -> Box + 'a> { let needle_len = needle.len(); let haystack_len = haystack.len(); if needle_len == 0 { // special case when anchored is true: return possible matches if anchored { return match search_type { SearchType::All => { let mut i = 0; let mut cost = costs.start_gap_cost as u32; let mut start = true; Box::new(iter::from_fn(move || { if start { start = false; return Some(Match{start: 0, end: 0, k: 0}); } if i < haystack_len { i += 1; cost += costs.gap_cost as u32; if cost <= k { return Some(Match{start: 0, end: i, k: cost}); } } None })) }, SearchType::Best => Box::new(iter::once(Match{start: 0, end: 0, k: 0})) }; }else{ return Box::new(iter::empty()); } } // enforce another constraint on the costs costs.check_search(); let len = needle_len + 1; let iter_len = if anchored { // start_gap_cost must be incurred at least once cmp::min(haystack_len, needle_len.saturating_add( (k.saturating_sub(costs.start_gap_cost as u32) as usize) / (costs.gap_cost as usize))) }else{ haystack_len }; let mut dp0 = vec![0u32; len]; let mut dp1 = vec![0u32; len]; let mut dp2 = vec![0u32; len]; let mut needle_gap_dp = vec![u32::MAX; len]; let mut haystack_gap_dp = vec![u32::MAX; len]; let mut length0 = vec![0usize; len]; let mut length1 = vec![0usize; len]; let mut length2 = vec![0usize; len]; let mut needle_gap_length = vec![0usize; len]; let mut haystack_gap_length = vec![0usize; len]; let mut curr_k = k; let mismatch_cost = costs.mismatch_cost as u32; let gap_cost = costs.gap_cost as u32; let start_gap_cost = costs.start_gap_cost as u32; let transpose_cost = match costs.transpose_cost { Some(cost) => cost as u32, None => 0 }; let allow_transpose = costs.transpose_cost.is_some(); let mut first = true; let mut i = 0; let res = iter::from_fn(move || { if first { first = false; for j in 0..len { dp1[j] = (j as u32) * gap_cost + if j == 0 {0} else {start_gap_cost}; } if dp1[len - 1] <= curr_k { if search_type == SearchType::Best { curr_k = dp1[len - 1]; } return Some((Match{start: 0, end: 0, k: dp1[len - 1]}, curr_k)); } } while i < iter_len { needle_gap_dp[0] = if anchored {(i as u32 + 1) * gap_cost + start_gap_cost} else {0}; dp2[0] = if anchored {(i as u32 + 1) * gap_cost + start_gap_cost} else {0}; needle_gap_length[0] = 0; length2[0] = 0; for j in 1..len { let sub = dp1[j - 1] + ((needle[j - 1] != haystack[i]) as u32) * mismatch_cost; let new_gap = dp1[j] + start_gap_cost + gap_cost; let 
cont_gap = needle_gap_dp[j].saturating_add(gap_cost); if new_gap < cont_gap { needle_gap_dp[j] = new_gap; needle_gap_length[j] = length1[j] + 1; }else if new_gap > cont_gap { needle_gap_dp[j] = cont_gap; needle_gap_length[j] += 1; }else{ needle_gap_dp[j] = cont_gap; needle_gap_length[j] = cmp::max(length1[j], needle_gap_length[j]) + 1; } let new_gap = dp2[j - 1] + start_gap_cost + gap_cost; let cont_gap = haystack_gap_dp[j - 1].saturating_add(gap_cost); if new_gap < cont_gap { haystack_gap_dp[j] = new_gap; haystack_gap_length[j] = length2[j - 1]; }else if new_gap > cont_gap { haystack_gap_dp[j] = cont_gap; haystack_gap_length[j] = haystack_gap_length[j - 1]; }else{ haystack_gap_dp[j] = cont_gap; haystack_gap_length[j] = cmp::max(length2[j - 1], haystack_gap_length[j - 1]); } dp2[j] = needle_gap_dp[j]; length2[j] = needle_gap_length[j]; if (haystack_gap_dp[j] < dp2[j]) || (haystack_gap_dp[j] == dp2[j] && length2[j - 1] > length2[j]) { dp2[j] = haystack_gap_dp[j]; length2[j] = haystack_gap_length[j]; } if (sub < dp2[j]) || (sub == dp2[j] && (length1[j - 1] + 1) > length2[j]) { dp2[j] = sub; length2[j] = length1[j - 1] + 1; } if allow_transpose && i > 0 && j > 1 && needle[j - 1] == haystack[i - 1] && needle[j - 2] == haystack[i] { let transpose = dp0[j - 2] + transpose_cost; if transpose <= dp2[j] { dp2[j] = transpose; length2[j] = length0[j - 2] + 2; } } } let final_res = dp2[len - 1]; let final_length = length2[len - 1]; mem::swap(&mut dp0, &mut dp1); mem::swap(&mut dp1, &mut dp2); mem::swap(&mut length0, &mut length1); mem::swap(&mut length1, &mut length2); i += 1; if final_res <= curr_k { match search_type { SearchType::Best => curr_k = final_res, _ => () } return Some((Match{start: i - final_length, end: i, k: final_res}, curr_k)); } } None }); if search_type == SearchType::Best { // estimate the number of Matches let mut res_vec = Vec::with_capacity(iter_len / needle_len); for m in res { match res_vec.len() { 0 => res_vec.push(m.0), _ => { let last = res_vec.last_mut().unwrap(); // replace previous if fully overlapping if m.0.start <= last.start { *last = m.0; }else{ res_vec.push(m.0); } } } curr_k = m.1; } return Box::new(res_vec.into_iter().filter(move |m| m.k == curr_k)); } Box::new(res.map(|m| m.0)) } /// Returns an iterator over the best `Match`s by searching through the text `haystack` for the /// pattern `needle` using SIMD acceleration. /// /// The best matches are the matches with the lowest Levenshtein distance. /// If multiple best matches end at the same position or fully overlap, then the longest match is chosen. /// If `needle` is empty, then no `Match`es are returned. /// Each returned `Match` requires at least half or more bytes of the `needle` to match /// somewhere in the `haystack`. /// This should be much faster than `levenshtein_search_naive`. /// Internally, this will automatically use AVX or SSE vectors with 8-bit, 16-bit, or 32-bit elements /// to represent anti-diagonals in the dynamic programming matrix for calculating Levenshtein distance. /// If AVX2 or SSE4.1 is not supported, then this will automatically fall back to /// `levenshtein_search_naive_with_opts`. 
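///
/// As an additional sketch (the inputs and expected output mirror a case from this crate's
/// test suite), a needle that never occurs exactly still produces the lowest-distance matches:
///
/// ```
/// # use triple_accel::*;
/// # use triple_accel::levenshtein::*;
/// let matches: Vec<Match> = levenshtein_search_simd(b"tst", b"testing 123 tasting!").collect();
///
/// // "tst" is one edit away from both "test" and "tast"
/// assert!(matches == vec![Match{start: 0, end: 4, k: 1}, Match{start: 12, end: 16, k: 1}]);
/// ```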
/// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let matches: Vec = levenshtein_search_simd(b"abc", b" abd").collect(); /// /// assert!(matches == vec![Match{start: 2, end: 5, k: 1}]); /// ``` pub fn levenshtein_search_simd<'a>(needle: &'a [u8], haystack: &'a [u8]) -> Box + 'a> { levenshtein_search_simd_with_opts(needle, haystack, ((needle.len() >> 1) as u32) + ((needle.len() as u32) & 1), SearchType::Best, LEVENSHTEIN_COSTS, false) } /// Returns an iterator over `Match`s by searching through the text `haystack` for the /// pattern `needle` using SIMD acceleration, with extra options. /// /// Note that overlapping matches may be returned. /// If multiple matches end at the same position, then the longest match is chosen. /// If `needle` is empty and `anchored` is false, then no `Match`es are returned. /// This should be much faster than `levenshtein_search_naive_with_opts`. /// Internally, this will automatically use AVX or SSE vectors with 8-bit, 16-bit, or 32-bit elements /// to represent anti-diagonals in the dynamic programming matrix for calculating Levenshtein distance. /// If AVX2 or SSE4.1 is not supported, then this will automatically fall back to /// `levenshtein_search_naive_with_opts`. /// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// * `k` - maximum cost threshold for a match to be returned /// * `search_type` - indicates whether to return all matches (within a cost of `k`), or the best matches with /// the lowest cost (additionally, only the longest matches are retained for matches that fully overlap) /// * `costs` - `EditCosts` struct for the cost of each edit operation /// * `anchored` - whether the `needle` should be anchored to the start of the `haystack` string, /// causing any shifts to cost gap edits /// /// # Example /// ``` /// # use triple_accel::*; /// # use triple_accel::levenshtein::*; /// let matches: Vec = levenshtein_search_simd_with_opts(b"abc", b" acb", 1, SearchType::All, RDAMERAU_COSTS, false).collect(); /// /// // note: it is possible to end the match at two different positions /// assert!(matches == vec![Match{start: 2, end: 4, k: 1}, Match{start: 2, end: 5, k: 1}]); /// ``` pub fn levenshtein_search_simd_with_opts<'a>(needle: &'a [u8], haystack: &'a [u8], k: u32, search_type: SearchType, costs: EditCosts, anchored: bool) -> Box + 'a> { if needle.len() == 0 { // special case when anchored is true: return possible matches if anchored { return match search_type { SearchType::All => { let mut i = 0; let mut cost = costs.start_gap_cost as u32; let mut start = true; Box::new(iter::from_fn(move || { if start { start = false; return Some(Match{start: 0, end: 0, k: 0}); } if i < haystack.len() { i += 1; cost += costs.gap_cost as u32; if cost <= k { return Some(Match{start: 0, end: i, k: cost}); } } None })) }, SearchType::Best => Box::new(iter::once(Match{start: 0, end: 0, k: 0})) }; }else{ return Box::new(iter::empty()); } } costs.check_search(); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { let unit_k = k.saturating_sub(costs.start_gap_cost as u32) / (costs.gap_cost as u32); // either the length of the match or the number of edits may exceed the maximum // available int size; additionally, MAX value is used to indicate overflow let upper_bound = cmp::max((needle.len() as u32).saturating_add(unit_k), k.saturating_add(1)); if cfg!(feature = "jewel-avx") 
&& is_x86_feature_detected!("avx2") { if cfg!(feature = "jewel-8bit") && needle.len() <= Avx1x32x8::static_upper_bound() && upper_bound <= u8::MAX as u32 { return unsafe {levenshtein_search_simd_core_avx_1x32x8(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-8bit") && needle.len() <= Avx2x32x8::static_upper_bound() && upper_bound <= u8::MAX as u32 { return unsafe {levenshtein_search_simd_core_avx_2x32x8(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-8bit") && needle.len() <= Avx4x32x8::static_upper_bound() && upper_bound <= u8::MAX as u32 { return unsafe {levenshtein_search_simd_core_avx_4x32x8(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-8bit") && needle.len() <= Avx8x32x8::static_upper_bound() && upper_bound <= u8::MAX as u32 { return unsafe {levenshtein_search_simd_core_avx_8x32x8(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-16bit") && upper_bound <= u16::MAX as u32 { return unsafe {levenshtein_search_simd_core_avx_nx16x16(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-32bit") { return unsafe {levenshtein_search_simd_core_avx_nx8x32(needle, haystack, k, search_type, costs, anchored)}; } }else if cfg!(feature = "jewel-sse") && is_x86_feature_detected!("sse4.1") { if cfg!(feature = "jewel-8bit") && needle.len() <= Sse1x16x8::static_upper_bound() && upper_bound <= u8::MAX as u32 { return unsafe {levenshtein_search_simd_core_sse_1x16x8(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-8bit") && needle.len() <= Sse2x16x8::static_upper_bound() && upper_bound <= u8::MAX as u32 { return unsafe {levenshtein_search_simd_core_sse_2x16x8(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-8bit") && needle.len() <= Sse4x16x8::static_upper_bound() && upper_bound <= u8::MAX as u32 { return unsafe {levenshtein_search_simd_core_sse_4x16x8(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-8bit") && needle.len() <= Sse8x16x8::static_upper_bound() && upper_bound <= u8::MAX as u32 { return unsafe {levenshtein_search_simd_core_sse_8x16x8(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-8bit") && needle.len() <= Sse16x16x8::static_upper_bound() && upper_bound <= u8::MAX as u32 { return unsafe {levenshtein_search_simd_core_sse_16x16x8(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-16bit") && upper_bound <= u16::MAX as u32 { return unsafe {levenshtein_search_simd_core_sse_nx8x16(needle, haystack, k, search_type, costs, anchored)}; }else if cfg!(feature = "jewel-32bit") { return unsafe {levenshtein_search_simd_core_sse_nx4x32(needle, haystack, k, search_type, costs, anchored)}; } } } levenshtein_search_naive_with_opts(needle, haystack, k, search_type, costs, anchored) } macro_rules! 
create_levenshtein_search_simd_core { ($name:ident, $jewel:ty, $target:literal) => { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = $target)] unsafe fn $name<'a>(needle: &'a [u8], haystack: &'a [u8], k: u32, search_type: SearchType, costs: EditCosts, anchored: bool) -> Box + 'a> { #[cfg(feature = "debug")] { println!("Debug: Levenshtein search Jewel vector type {} for target {}.", stringify!($jewel), stringify!($target)); } let needle_len = needle.len(); let haystack_len = haystack.len(); let mut dp0 = <$jewel>::repeating_max(needle_len); let mut dp_temp = <$jewel>::repeating_max(needle_len); let mut dp1 = <$jewel>::repeating_max(needle_len); let mut dp2 = <$jewel>::repeating_max(needle_len); let mut needle_gap_dp = <$jewel>::repeating_max(needle_len); let mut haystack_gap_dp = <$jewel>::repeating_max(needle_len); dp2.slow_insert(dp2.upper_bound() - 1, costs.start_gap_cost as u32 + costs.gap_cost as u32); // last cell haystack_gap_dp.slow_insert(dp2.upper_bound() - 1, costs.start_gap_cost as u32 + costs.gap_cost as u32); // save length instead of start idx due to int size constraints let mut length0 = <$jewel>::repeating(0, needle_len); let mut length_temp = <$jewel>::repeating(0, needle_len); let mut length1 = <$jewel>::repeating(0, needle_len); let mut length2 = <$jewel>::repeating(0, needle_len); let mut needle_gap_length = <$jewel>::repeating(0, needle_len); let mut haystack_gap_length = <$jewel>::repeating(0, needle_len); let ones = <$jewel>::repeating(1, needle_len); let twos = <$jewel>::repeating(2, needle_len); // the suffix of haystack can be ignored if needle must be anchored let len = if anchored { needle_len + cmp::min(haystack_len, needle_len.saturating_add( (k.saturating_sub(costs.start_gap_cost as u32) as usize) / (costs.gap_cost as usize))) }else{ needle_len + haystack_len }; let final_idx = dp1.upper_bound() - needle_len; // load needle characters into needle_window in reversed order let mut needle_window = <$jewel>::repeating(0, needle_len); needle_window.slow_loadu(needle_window.upper_bound() - 1, needle.as_ptr(), needle_len, true); let mut haystack_window = <$jewel>::repeating(0, needle_len); let mut haystack_idx = 0usize; let mut curr_k = k; // used in calculations let mut match_mask0 = <$jewel>::repeating(0, needle_len); let mut match_mask1 = <$jewel>::repeating(0, needle_len); let mut match_mask_cost = <$jewel>::repeating(0, needle_len); let mut sub = <$jewel>::repeating(0, needle_len); let mut sub_length = <$jewel>::repeating(0, needle_len); let mut needle_gap = <$jewel>::repeating(0, needle_len); let mut haystack_gap = <$jewel>::repeating(0, needle_len); let mut transpose = <$jewel>::repeating(0, needle_len); let mut transpose_length = <$jewel>::repeating(0, needle_len); let mismatch_cost = <$jewel>::repeating(costs.mismatch_cost as u32, needle_len); let gap_cost = <$jewel>::repeating(costs.gap_cost as u32, needle_len); let start_gap_cost = <$jewel>::repeating(costs.start_gap_cost as u32, needle_len); let transpose_cost = match costs.transpose_cost { Some(cost) => <$jewel>::repeating(cost as u32, needle_len), None => <$jewel>::repeating(0, needle_len) }; let allow_transpose = costs.transpose_cost.is_some(); // ..i... 
// --h--- // | //////xxxx // | x//////xxx // n xx//////xx // | xxx//////x // | xxxx////// // // 'n' = needle, 'h' = haystack // each (anti) diagonal is denoted using '/' and 'x' // 'x' marks cells that are not in the traditional dp array // every (anti) diagonal is calculated simultaneously using Jewel vectors // note: each vector goes from bottom-left to top-right, ending at the first row in the // DP matrix // similar to levenshtein_simd_k, but without alternating anti-diagonals // note that the first row, which should be all zeros for searching, is not saved in the // Jewel vectors, for space concerns // therefore, starting at i = 1 does not include the first diagonal that only contains // a zero from the initial row of zeros // when left shift are required, then zeros must be shifted in // if anchored = true, then the number of gaps times the gap cost plus the starting gap // cost must be shifted in // for speed, transpositions are done by directly blending using the mask, without calculating // the minimum cost compared to the other edit operations let mut i = 1; let res = iter::from_fn(move || { while i < len { // shift the haystack window haystack_window.shift_left_1_mut(); if haystack_idx < haystack_len { haystack_window.insert_last_0(*haystack.get_unchecked(haystack_idx) as u32); haystack_idx += 1; } <$jewel>::cmpeq(&needle_window, &haystack_window, &mut match_mask1); <$jewel>::andnot(&match_mask1, &mismatch_cost, &mut match_mask_cost); // match/mismatch <$jewel>::shift_left_1(&dp1, &mut sub); if anchored && i > 1 { // dp1 is 2 diagonals behind the current i // must be capped at k to prevent overflow when inserting sub.insert_last_0(cmp::min((i as u32 - 1) * (costs.gap_cost as u32) + costs.start_gap_cost as u32, k + 1)); } sub.adds_mut(&match_mask_cost); <$jewel>::shift_left_1(&length1, &mut sub_length); // zeros are shifted in sub_length.add_mut(&ones); // gap in needle <$jewel>::adds(&dp2, &start_gap_cost, &mut needle_gap); <$jewel>::double_min_length(&needle_gap, &mut needle_gap_dp, &length2, &mut needle_gap_length); needle_gap_dp.adds_mut(&gap_cost); needle_gap_length.add_mut(&ones); // gap in haystack <$jewel>::adds(&dp2, &start_gap_cost, &mut haystack_gap); <$jewel>::double_min_length(&haystack_gap, &mut haystack_gap_dp, &length2, &mut haystack_gap_length); haystack_gap_dp.shift_left_1_mut(); // zeros are shifted in if anchored { // dp2 is one diagonal behind the current i haystack_gap_dp.insert_last_0(cmp::min((i as u32) * (costs.gap_cost as u32) + costs.start_gap_cost as u32, k + 1)); }else{ haystack_gap_dp.insert_last_0(costs.start_gap_cost as u32); } haystack_gap_dp.adds_mut(&gap_cost); haystack_gap_length.shift_left_1_mut(); // zeros are shifted in if allow_transpose { <$jewel>::shift_left_1(&match_mask0, &mut transpose); // reuse transpose, shift in zeros transpose.and_mut(&match_mask0); // ensure that current matches are excluded <$jewel>::andnot(&match_mask1, &transpose, &mut match_mask0); // reuse match_mask0 to represent transpose mask dp0.shift_left_2_mut(); if anchored && i > 3 { // dp0 is four diagonals behind the current i dp0.insert_last_1(cmp::min((i as u32 - 3) * (costs.gap_cost as u32) + costs.start_gap_cost as u32, k + 1)); } length0.shift_left_2_mut(); <$jewel>::adds(&dp0, &transpose_cost, &mut transpose); <$jewel>::add(&length0, &twos, &mut transpose_length); } <$jewel>::triple_min_length(&sub, &needle_gap_dp, &haystack_gap_dp, &sub_length, &needle_gap_length, &haystack_gap_length, &mut dp0, &mut length0); if allow_transpose { // blend using transpose 
mask dp0.blendv_mut(&transpose, &match_mask0); length0.blendv_mut(&transpose_length, &match_mask0); mem::swap(&mut match_mask0, &mut match_mask1); } mem::swap(&mut dp0, &mut dp_temp); mem::swap(&mut dp_temp, &mut dp1); mem::swap(&mut dp1, &mut dp2); mem::swap(&mut length0, &mut length_temp); mem::swap(&mut length_temp, &mut length1); mem::swap(&mut length1, &mut length2); i += 1; if i >= needle_len { let final_res = dp2.slow_extract(final_idx); let final_length = length2.slow_extract(final_idx) as usize; if final_res <= curr_k { let end_idx = i - needle_len; match search_type { // if we want the best, then we can shrink the k threshold SearchType::Best => curr_k = final_res, _ => () } return Some((Match{start: end_idx - final_length, end: end_idx, k: final_res}, curr_k)); } } } None }); if search_type == SearchType::Best { // estimate the number of Matches let mut res_vec = Vec::with_capacity((len - needle_len) / needle_len); for m in res { match res_vec.len() { 0 => res_vec.push(m.0), _ => { let last = res_vec.last_mut().unwrap(); // replace previous if fully overlapping if m.0.start <= last.start { *last = m.0; }else{ res_vec.push(m.0); } } } curr_k = m.1; } // only retain matches with the lowest k return Box::new(res_vec.into_iter().filter(move |m| m.k == curr_k)); } Box::new(res.map(|m| m.0)) } }; } // duplicate functions for each Jewel vector type #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_avx_1x32x8, Avx1x32x8, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_avx_2x32x8, Avx2x32x8, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_avx_4x32x8, Avx4x32x8, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_avx_8x32x8, Avx8x32x8, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_avx_nx16x16, AvxNx16x16, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_avx_nx8x32, AvxNx8x32, "avx2"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_sse_1x16x8, Sse1x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_sse_2x16x8, Sse2x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_sse_4x16x8, Sse4x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_sse_8x16x8, Sse8x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_sse_16x16x8, Sse16x16x8, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_sse_nx8x16, SseNx8x16, "sse4.1"); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] create_levenshtein_search_simd_core!(levenshtein_search_simd_core_sse_nx4x32, SseNx4x32, "sse4.1"); /// Returns an iterator over best `Match`s by searching through the text `haystack` for the /// pattern `needle` using SIMD acceleration. 
/// /// The best matches are the matches with the lowest Levenshtein distance. /// If multiple best matches end at the same position or fully overlap, then the longest match is chosen. /// If `needle` is empty, then no `Match`es are returned. /// Each returned `Match` requires at least half or more bytes of the `needle` to match /// somewhere in the `haystack`. /// Internally, this will call `levenshtein_search_simd`. /// If AVX2 or SSE4.1 is not supported, then this will automatically fall back to a scalar alternative. /// /// # Arguments /// * `needle` - pattern string (slice) /// * `haystack` - text string (slice) /// /// # Example /// ``` /// # use triple_accel::*; /// let matches: Vec = levenshtein_search(b"abc", b" abd").collect(); /// /// assert!(matches == vec![Match{start: 2, end: 5, k: 1}]); /// ``` pub fn levenshtein_search<'a>(needle: &'a [u8], haystack: &'a [u8]) -> Box + 'a> { levenshtein_search_simd(needle, haystack) } triple_accel-0.4.0/src/lib.rs000064400000000000000000000214320000000000000141350ustar 00000000000000//! # triple_accel //! //! Rust edit distance routines accelerated using SIMD. Supports fast Hamming, Levenshtein, //! restricted Damerau-Levenshtein, etc. distance calculations and string search. //! //! Although vectorized SIMD code allows for up to 20-30x speedups over their scalar counterparts, //! the difficulty of handling platform-dependent SIMD code makes SIMD routines less attractive. //! The goal of this library is to provide an easy-to-use abstraction over SIMD edit distance routines //! that fall back to scalar routines if the target CPU architecture is not supported. //! Additionally, all limitations and tradeoffs of the edit distance routines should be provided upfront //! so the user knows exactly what to expect. //! Finally, this library should lead to performance boosts on both short and longer strings, so it //! can be used for a variety of tasks, from bioinformatics to natural language processing. //! `triple_accel` is very lightweight: it only has dependencies on other crates for benchmarking. //! It can be built on machines without CPUs that have AVX2 or SSE4.1 support. It can also run on //! machines without SIMD support by automatically using scalar alternatives. //! //! ## Features //! //! This library provides routines for both searching for some needle string in a haystack string //! and calculating the edit distance between two strings. Hamming distance (mismatches only), //! Levenshtein distance (mismatches + gaps), and restricted Damerau-Levenshtein distance //! (transpositions + mismatches + gaps) are supported, along with arbitrary edit costs. This //! library provides a simple interface, in addition to powerful lower-level control over the edit //! distance calculations. //! //! At runtime, the implementation for a certain algorithm is selected based on CPU support, going //! down the list: //! //! 1. Vectorized implementation with 256-bit AVX vectors, if AVX2 is supported. //! 2. Vectorized implementation with 128-bit SSE vectors, if SSE4.1 is supported. //! 3. Scalar implementation. //! //! Currently, vectorized SIMD implementations are only available for x86 or x86-64 CPUs. However, //! after compiling this library on a machine that supports those SIMD intrinsics, the library can //! be used on other machines. //! Additionally, the internal data structure for storing vectors and the bit width of the values //! in the vectors are selected at runtime for maximum efficiency and accuracy, given the lengths //! of the input strings. 
//! //! ## Limitations //! //! Due to the use of SIMD intrinsics, only binary strings that are represented with `u8` bytes //! are supported. Unicode strings are not currently supported. //! //! ## Notation //! //! Quick notation notes that will often appear in the code/documentation: //! //! * `k` - the number of edits that are allowed //! * `a` and `b` - any two strings; this is usually used for edit distance routines //! * `needle` and `haystack` - any two strings; we want to search for where needle appears in //! haystack //! //! ## Examples //! //! Calculating the Hamming distance (number of mismatches) between two strings is extremely simple: //! ``` //! use triple_accel::*; //! //! let a = b"abcd"; //! let b = b"abcc"; //! //! let dist = hamming(a, b); //! assert!(dist == 1); //! ``` //! By default, SIMD will be used if possible. Similarly, we can easily calculate the Levenshtein //! distance (character mismatches and gaps all have a cost of 1) between two strings with the //! following code: //! ``` //! use triple_accel::*; //! //! let a = b"abc"; //! let b = b"abcd"; //! //! let dist = levenshtein_exp(a, b); //! assert!(dist == 1); //! ``` //! This uses exponential search to estimate the number of edits between `a` and `b`, which makes it //! more efficient than the alternative `levenshtein` function when the number of edits between `a` //! and `b` is low. //! //! In addition to edit distance routines, `triple_accel` also provides search routines. These //! routines return an iterator over matches that indicate where the `needle` string matches the `haystack` //! string. `triple_accel` will attempt to maximize the length of matches that end at the same position and //! remove shorter matches when some matches fully overlap. //! ``` //! use triple_accel::*; //! //! let needle = b"helllo"; //! let haystack = b"hello world"; //! //! let matches: Vec = levenshtein_search(needle, haystack).collect(); //! // note: start index is inclusive, end index is exclusive! //! assert!(matches == vec![Match{start: 0, end: 5, k: 1}]); //! ``` //! Sometimes, it is necessary to use the slightly lower level, but also more powerful routines //! that `triple_accel` provides. For example, it is possible to allow transpositions (character swaps) //! that have a cost of 1, in addition to mismatches and gaps: //! ``` //! use triple_accel::levenshtein::*; //! //! let a = b"abcd"; //! let b = b"abdc"; //! let k = 2; // upper bound on allowed cost //! let trace_on = false; // return edit traceback? //! //! let dist = levenshtein_simd_k_with_opts(a, b, k, trace_on, RDAMERAU_COSTS); //! // note: dist may be None if a and b do not match within a cost of k //! assert!(dist.unwrap().0 == 1); //! ``` //! Don't let the name of the function fool you! `levenshtein_simd_k_with_opts` will still fall back to //! the scalar implementation if AVX2 or SSE4.1 support is not available. It just prefers to use SIMD //! where possible. //! //! For most common cases, the re-exported functions are enough, and the low level functions do not //! have to be used directly. use std::*; mod jewel; pub mod hamming; pub mod levenshtein; // re-export common functions pub use hamming::{hamming, hamming_search}; pub use levenshtein::{levenshtein, rdamerau, levenshtein_exp, rdamerau_exp, levenshtein_search}; // some shared utility stuff below /// A struct that describes a single matching location. /// /// This is usually returned as part of searching routines. 
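///
/// A minimal illustrative sketch (not tied to any particular search routine): since `start` is
/// inclusive and `end` is exclusive, the length of the matched span in the haystack is
/// `end - start`.
///
/// ```
/// # use triple_accel::*;
/// let m = Match{start: 2, end: 5, k: 1};
///
/// assert!(m.end - m.start == 3);
/// ```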
#[derive(Debug, PartialEq)]
pub struct Match {
    /// The start index of the match (inclusive).
    pub start: usize,
    /// The end index of the match (exclusive).
    pub end: usize,
    /// Number of edits for the match.
    pub k: u32
}

/// An enum describing possible edit operations.
///
/// This is usually returned as part of the traceback for edit distance routines.
#[derive(Debug, PartialEq)]
pub enum EditType {
    Match,
    Mismatch,
    AGap,
    BGap,
    Transpose
}

/// A struct representing a sequence of edits of the same type.
///
/// This is returned in the run-length encoded traceback of edit distance routines.
#[derive(Debug, PartialEq)]
pub struct Edit {
    /// The type of edit operation.
    pub edit: EditType,
    /// The number of consecutive edit operations of the same type.
    pub count: usize
}

/// An enum representing whether to return all matches or just the best matches.
///
/// This is used as an argument for searching routines.
#[derive(Debug, PartialEq, Copy, Clone)]
pub enum SearchType {
    All,
    Best
}

/// This creates a vector with the alignment and padding for `u128` values, and
/// then converts it to a vector of `u8` values that is returned.
///
/// This is possible because `u8` has looser alignment requirements than `u128`.
/// This vector can be easily converted back to `u128` or `u64` later, for Hamming
/// distance routines.
/// The returned vector can be edited by copying `u8` values into it.
/// However, do not do any operation (like `push`) that may cause the vector to be
/// reallocated.
///
/// # Arguments
/// * `len` - the length of the resulting array of u8 values
///
/// # Example
/// ```
/// # use triple_accel::*;
/// let s = alloc_str(10);
///
/// assert!(s.len() == 10);
/// ```
#[inline]
pub fn alloc_str(len: usize) -> Vec<u8> {
    let words_len = (len >> 4) + (if (len & 15) > 0 {1} else {0});
    let words = vec![0u128; words_len];
    let mut words = mem::ManuallyDrop::new(words);

    unsafe {
        Vec::from_raw_parts(words.as_mut_ptr() as *mut u8, len, words_len << 4)
    }
}

/// Directly copy from a source `u8` slice to a destination `u8` slice.
///
/// Can be used to copy string data after allocating a vector using `alloc_str`.
///
/// # Arguments
/// * `dest` - the destination slice
/// * `src` - the source slice
///
/// # Panics
/// * If the length of `src` is greater than the length of `dest`.
/// /// # Example /// ``` /// # use triple_accel::*; /// let mut a = vec![0u8; 5]; /// let b = vec![1u8, 2u8, 3u8, 4u8]; /// /// fill_str(&mut a, &b); /// /// assert!(a == vec![1u8, 2u8, 3u8, 4u8, 0u8]); /// ``` #[inline] pub fn fill_str(dest: &mut [u8], src: &[u8]) { assert!(dest.len() >= src.len()); unsafe { ptr::copy_nonoverlapping(src.as_ptr(), dest.as_mut_ptr(), src.len()); } } fn check_no_null_bytes(s: &[u8]) { for i in 0..s.len() { if s[i] == 0u8 { panic!("No zero/null bytes allowed in the string!"); } } } triple_accel-0.4.0/tests/basic_tests.rs000064400000000000000000000656300000000000000162550ustar 00000000000000use triple_accel::*; use triple_accel::hamming::*; use triple_accel::levenshtein::*; #[test] fn test_basic_hamming_naive() { let a1 = b"abc"; let b1 = b"abd"; let dist = hamming_naive(a1, b1); assert!(dist == 1); let a2 = b""; let b2 = b""; let dist = hamming_naive(a2, b2); assert!(dist == 0); } #[test] fn test_basic_hamming_search_naive() { let a1 = b"abc"; let b1 = b" abc abb"; let mut res: Vec = hamming_search_naive_with_opts(a1, b1, 1, SearchType::All).collect(); assert!(res == vec![Match{start: 2, end: 5, k: 0}, Match{start: 7, end: 10, k: 1}]); let a2 = b"abc"; let b2 = b" abc abb"; res = hamming_search_naive(a2, b2).collect(); assert!(res == vec![Match{start: 2, end: 5, k: 0}]); } #[test] fn test_basic_hamming_search_simd() { let a1 = b"abc"; let b1 = b" abc abb aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; let mut res: Vec = hamming_search_simd_with_opts(a1, b1, 1, SearchType::All).collect(); assert!(res == vec![Match{start: 2, end: 5, k: 0}, Match{start: 7, end: 10, k: 1}]); let a2 = b"abc"; let b2 = b" abc abb aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; res = hamming_search_simd(a2, b2).collect(); assert!(res == vec![Match{start: 2, end: 5, k: 0}]); } #[test] fn test_basic_hamming_words_64() { let a_str = b"abc"; let b_str = b"abd"; let mut a = alloc_str(a_str.len()); fill_str(&mut a, a_str); let mut b = alloc_str(b_str.len()); fill_str(&mut b, b_str); let dist = hamming_words_64(&a, &b); assert!(dist == 1); } #[test] fn test_basic_hamming_words_128() { let a_str = b"abc"; let b_str = b"abd"; let mut a = alloc_str(a_str.len()); fill_str(&mut a, a_str); let mut b = alloc_str(b_str.len()); fill_str(&mut b, b_str); let dist = hamming_words_128(&a, &b); assert!(dist == 1); } #[test] fn test_basic_hamming_simd_movemask() { let a1 = b"abcaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; let b1 = b"abdaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; let dist = hamming_simd_movemask(a1, b1); assert!(dist == 1); let a2 = b""; let b2 = b""; let dist = hamming_simd_movemask(a2, b2); assert!(dist == 0); } #[test] fn test_basic_hamming_simd_parallel() { let a1 = b"abcaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; let b1 = b"abdaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"; let dist = hamming_simd_parallel(a1, b1); assert!(dist == 1); let a2 = b""; let b2 = b""; let dist = hamming_simd_parallel(a2, b2); assert!(dist == 0); } #[test] fn test_basic_levenshtein_naive() { let a1 = b"abcde"; let b1 = b" ab cde"; let mut res = levenshtein_naive(a1, b1); assert!(res == 2); let a2 = b"abcde"; let b2 = b""; res = levenshtein_naive(a2, b2); assert!(res == 5); let a3 = b"abcde"; let b3 = b"abcdee"; res = levenshtein_naive(a3, b3); assert!(res == 1); let a4 = b"abcde"; let b4 = b"acde"; res = levenshtein_naive(a4, b4); assert!(res == 1); let a5 = b"abcde"; let b5 = b"abbde"; res = levenshtein_naive(a5, b5); assert!(res == 1); let a6 = b"abcde"; let b6 = b"acbde"; res = levenshtein_naive_with_opts(a6, b6, false, 
EditCosts::new(1, 1, 0, Some(1))).0; assert!(res == 1); let a7 = b"ab"; let b7 = b"ba"; res = levenshtein_naive_with_opts(a7, b7, false, EditCosts::new(1, 1, 0, Some(1))).0; assert!(res == 1); let a8 = b"abc"; let b8 = b"aac"; res = levenshtein_naive_with_opts(a8, b8, false, EditCosts::new(2, 3, 0, None)).0; assert!(res == 2); let a9 = b"abc"; let b9 = b"aac"; res = levenshtein_naive_with_opts(a9, b9, false, EditCosts::new(3, 1, 0, None)).0; assert!(res == 2); let a10 = b"abc"; let b10 = b"ac"; res = levenshtein_naive_with_opts(a10, b10, false, EditCosts::new(1, 1, 2, None)).0; assert!(res == 3); let a11 = b"acde"; let b11 = b"abce"; res = levenshtein_naive_with_opts(a11, b11, false, EditCosts::new(2, 1, 2, None)).0; assert!(res == 4); let a12 = b"abcde"; let b12 = b"abe"; res = levenshtein_naive_with_opts(a12, b12, false, EditCosts::new(1, 1, 2, None)).0; assert!(res == 4); } #[test] fn test_trace_on_levenshtein_naive() { let a1 = b"abcde"; let b1 = b" ab cde"; let mut res = levenshtein_naive_with_opts(a1, b1, true, LEVENSHTEIN_COSTS); assert!(res.0 == 2); assert!(res.1.unwrap() == vec![Edit{edit: EditType::AGap, count: 1}, Edit{edit: EditType::Match, count: 2}, Edit{edit: EditType::AGap, count: 1}, Edit{edit: EditType::Match, count: 3}]); let a2 = b"abcde"; let b2 = b""; res = levenshtein_naive_with_opts(a2, b2, true, LEVENSHTEIN_COSTS); assert!(res.0 == 5); assert!(res.1.unwrap() == vec![Edit{edit: EditType::BGap, count: 5}]); let a3 = b"abcde"; let b3 = b"abcce"; res = levenshtein_naive_with_opts(a3, b3, true, LEVENSHTEIN_COSTS); assert!(res.0 == 1); assert!(res.1.unwrap() == vec![Edit{edit: EditType::Match, count: 3}, Edit{edit: EditType::Mismatch, count: 1}, Edit{edit: EditType::Match, count: 1}]); let a4 = b"abcde"; let b4 = b"acbde"; res = levenshtein_naive_with_opts(a4, b4, true, EditCosts::new(1, 1, 0, Some(1))); assert!(res.0 == 1); assert!(res.1.unwrap() == vec![Edit{edit: EditType::Match, count: 1}, Edit{edit: EditType::Transpose, count: 1}, Edit{edit: EditType::Match, count: 2}]); } #[test] fn test_basic_levenshtein() { let a1 = b"abcde"; let b1 = b" ab cde"; let mut res = levenshtein(a1, b1); assert!(res == 2); let a2 = b"abcde"; let b2 = b""; res = levenshtein(a2, b2); assert!(res == 5); let a3 = b"abcde"; let b3 = b"abcdee"; res = levenshtein(a3, b3); assert!(res == 1); let a4 = b"abcde"; let b4 = b"acde"; res = levenshtein(a4, b4); assert!(res == 1); let a5 = b"abcde"; let b5 = b"abbde"; res = levenshtein(a5, b5); assert!(res == 1); } #[test] fn test_basic_levenshtein_exp() { let a1 = b"abcde"; let b1 = b" ab cde"; let mut res = levenshtein_exp(a1, b1); assert!(res == 2); let a2 = b"abcde"; let b2 = b""; res = levenshtein_exp(a2, b2); assert!(res == 5); let a3 = b"abcde"; let b3 = b"abcdee"; res = levenshtein_exp(a3, b3); assert!(res == 1); let a4 = b"abcde"; let b4 = b"acde"; res = levenshtein_exp(a4, b4); assert!(res == 1); let a5 = b"abcde"; let b5 = b"abbde"; res = levenshtein_exp(a5, b5); assert!(res == 1); } #[test] fn test_basic_rdamerau() { let a1 = b"abcde"; let b1 = b" ab dce"; let mut res = rdamerau(a1, b1); assert!(res == 3); let a2 = b"abcde"; let b2 = b""; res = rdamerau(a2, b2); assert!(res == 5); let a3 = b"abcde"; let b3 = b"bacdee"; res = rdamerau(a3, b3); assert!(res == 2); let a4 = b"abcde"; let b4 = b"acde"; res = rdamerau(a4, b4); assert!(res == 1); let a5 = b"abcde"; let b5 = b"abbde"; res = rdamerau(a5, b5); assert!(res == 1); } #[test] fn test_basic_rdamerau_exp() { let a1 = b"abcde"; let b1 = b" ab dce"; let mut res = rdamerau_exp(a1, b1); 
assert!(res == 3); let a2 = b"abcde"; let b2 = b""; res = rdamerau_exp(a2, b2); assert!(res == 5); let a3 = b"abcde"; let b3 = b"bacdee"; res = rdamerau_exp(a3, b3); assert!(res == 2); let a4 = b"abcde"; let b4 = b"acde"; res = rdamerau_exp(a4, b4); assert!(res == 1); let a5 = b"abcde"; let b5 = b"abbde"; res = rdamerau_exp(a5, b5); assert!(res == 1); } #[test] fn test_basic_levenshtein_naive_k_with_opts() { let a1 = b"abcde"; let b1 = b" ab cde"; let mut res = levenshtein_naive_k_with_opts(a1, b1, 2, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 2); assert!(res.1.is_none()); let a2 = b"abcde"; let b2 = b""; res = levenshtein_naive_k_with_opts(a2, b2, 10, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 5); assert!(res.1.is_none()); let a3 = b"abcde"; let b3 = b"abcdee"; res = levenshtein_naive_k_with_opts(a3, b3, 2, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a4 = b"abcde"; let b4 = b"acde"; res = levenshtein_naive_k_with_opts(a4, b4, 2, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a5 = b"abcde"; let b5 = b"abbde"; res = levenshtein_naive_k_with_opts(a5, b5, 2, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a6 = b"abcde"; let b6 = b"abbde"; res = levenshtein_naive_k_with_opts(a6, b6, 1, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a7 = b"abcde"; let b7 = b"acbde"; res = levenshtein_naive_k_with_opts(a7, b7, 1, false, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a8 = b"ab"; let b8 = b"ba"; res = levenshtein_naive_k_with_opts(a8, b8, 1, false, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a9 = b"abc"; let b9 = b"aac"; res = levenshtein_naive_k_with_opts(a9, b9, 5, false, EditCosts::new(2, 3, 0, None)).unwrap(); assert!(res.0 == 2); assert!(res.1.is_none()); let a10 = b"abc"; let b10 = b"aac"; res = levenshtein_naive_k_with_opts(a10, b10, 5, false, EditCosts::new(3, 1, 0, None)).unwrap(); assert!(res.0 == 2); assert!(res.1.is_none()); let a11 = b"abc"; let b11 = b"ac"; res = levenshtein_naive_k_with_opts(a11, b11, 5, false, EditCosts::new(1, 1, 2, None)).unwrap(); assert!(res.0 == 3); assert!(res.1.is_none()); let a12 = b"acde"; let b12 = b"abce"; res = levenshtein_naive_k_with_opts(a12, b12, 5, false, EditCosts::new(2, 1, 2, None)).unwrap(); assert!(res.0 == 4); assert!(res.1.is_none()); let a13 = b"abcde"; let b13 = b"abe"; res = levenshtein_naive_k_with_opts(a13, b13, 5, false, EditCosts::new(1, 1, 2, None)).unwrap(); assert!(res.0 == 4); assert!(res.1.is_none()); let a14 = b"abcde"; let b14 = b"hello"; let res1 = levenshtein_naive_k_with_opts(a14, b14, 1, false, RDAMERAU_COSTS); assert!(res1.is_none()); } #[test] fn test_trace_on_levenshtein_naive_k_with_opts() { let a1 = b"abcde"; let b1 = b" ab cde"; let mut res = levenshtein_naive_k_with_opts(a1, b1, 2, true, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 2); assert!(res.1.unwrap() == vec![Edit{edit: EditType::AGap, count: 1}, Edit{edit: EditType::Match, count: 2}, Edit{edit: EditType::AGap, count: 1}, Edit{edit: EditType::Match, count: 3}]); let a2 = b"abcde"; let b2 = b""; res = levenshtein_naive_k_with_opts(a2, b2, 10, true, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 5); assert!(res.1.unwrap() == vec![Edit{edit: EditType::BGap, count: 5}]); let a3 = b"abcde"; let b3 = b"abcce"; res = levenshtein_naive_k_with_opts(a3, b3, 2, true, LEVENSHTEIN_COSTS).unwrap(); 
assert!(res.0 == 1); assert!(res.1.unwrap() == vec![Edit{edit: EditType::Match, count: 3}, Edit{edit: EditType::Mismatch, count: 1}, Edit{edit: EditType::Match, count: 1}]); let a4 = b"abcde"; let b4 = b"acbde"; res = levenshtein_naive_k_with_opts(a4, b4, 2, true, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 1); assert!(res.1.unwrap() == vec![Edit{edit: EditType::Match, count: 1}, Edit{edit: EditType::Transpose, count: 1}, Edit{edit: EditType::Match, count: 2}]); } #[test] fn test_basic_levenshtein_simd_k_with_opts() { let a1 = b"abcde"; let b1 = b" ab cde"; let mut res = levenshtein_simd_k_with_opts(a1, b1, 2, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 2); assert!(res.1.is_none()); let a2 = b"abcde"; let b2 = b""; res = levenshtein_simd_k_with_opts(a2, b2, 30, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 5); assert!(res.1.is_none()); let a3 = b"abcde"; let b3 = b"abcdee"; res = levenshtein_simd_k_with_opts(a3, b3, 20, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a4 = b"abcde"; let b4 = b"acde"; res = levenshtein_simd_k_with_opts(a4, b4, 1, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a5 = b"abcde"; let b5 = b"abbde"; res = levenshtein_simd_k_with_opts(a5, b5, 2, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a6 = b"abcde"; let b6 = b"acbde"; res = levenshtein_simd_k_with_opts(a6, b6, 2, false, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a7 = b"ab"; let b7 = b"ba"; res = levenshtein_simd_k_with_opts(a7, b7, 2, false, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a8 = b"abc"; let b8 = b"aac"; res = levenshtein_simd_k_with_opts(a8, b8, 5, false, EditCosts::new(2, 3, 0, None)).unwrap(); assert!(res.0 == 2); assert!(res.1.is_none()); let a9 = b"abc"; let b9 = b"aac"; res = levenshtein_simd_k_with_opts(a9, b9, 5, false, EditCosts::new(3, 1, 0, None)).unwrap(); assert!(res.0 == 2); assert!(res.1.is_none()); let a10 = b"abc"; let b10 = b"ac"; res = levenshtein_simd_k_with_opts(a10, b10, 5, false, EditCosts::new(1, 1, 2, None)).unwrap(); assert!(res.0 == 3); assert!(res.1.is_none()); let a11 = b"acde"; let b11 = b"abce"; res = levenshtein_simd_k_with_opts(a11, b11, 5, false, EditCosts::new(2, 1, 2, None)).unwrap(); assert!(res.0 == 4); assert!(res.1.is_none()); let a12 = b"abcde"; let b12 = b"abe"; res = levenshtein_simd_k_with_opts(a12, b12, 5, false, EditCosts::new(1, 1, 2, None)).unwrap(); assert!(res.0 == 4); assert!(res.1.is_none()); let a13 = b"\0"; let b13 = b""; res = levenshtein_simd_k_with_opts(a13, b13, 2, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a14 = b"ab\0de"; let b14 = b"a\0bde"; res = levenshtein_simd_k_with_opts(a14, b14, 2, false, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a15 = b"\0b"; let b15 = b"b\0"; res = levenshtein_simd_k_with_opts(a15, b15, 2, false, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a16 = b"\0"; let b16 = b"\0\0"; res = levenshtein_simd_k_with_opts(a16, b16, 2, false, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a17 = b"\0"; let b17 = b"\0"; res = levenshtein_simd_k_with_opts(a17, b17, 2, false, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 0); assert!(res.1.is_none()); let a18 = b"\0\0b\0"; let b18 = b"\0b\0\0"; 
res = levenshtein_simd_k_with_opts(a18, b18, 2, false, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 1); assert!(res.1.is_none()); let a19 = b"abcde"; let b19 = b"hello"; let res1 = levenshtein_simd_k_with_opts(a19, b19, 1, false, RDAMERAU_COSTS); assert!(res1.is_none()); } #[test] fn test_trace_on_levenshtein_simd_k_with_opts() { let a1 = b"abcde"; let b1 = b" ab cde"; let mut res = levenshtein_simd_k_with_opts(a1, b1, 30, true, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 2); assert!(res.1.unwrap() == vec![Edit{edit: EditType::AGap, count: 1}, Edit{edit: EditType::Match, count: 2}, Edit{edit: EditType::AGap, count: 1}, Edit{edit: EditType::Match, count: 3}]); let a2 = b"abcde"; let b2 = b""; res = levenshtein_simd_k_with_opts(a2, b2, 5, true, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 5); assert!(res.1.unwrap() == vec![Edit{edit: EditType::BGap, count: 5}]); let a3 = b"abcde"; let b3 = b"abcce"; res = levenshtein_simd_k_with_opts(a3, b3, 1, true, LEVENSHTEIN_COSTS).unwrap(); assert!(res.0 == 1); assert!(res.1.unwrap() == vec![Edit{edit: EditType::Match, count: 3}, Edit{edit: EditType::Mismatch, count: 1}, Edit{edit: EditType::Match, count: 1}]); let a4 = b"abcde"; let b4 = b"acbde"; res = levenshtein_simd_k_with_opts(a4, b4, 2, true, EditCosts::new(1, 1, 0, Some(1))).unwrap(); assert!(res.0 == 1); assert!(res.1.unwrap() == vec![Edit{edit: EditType::Match, count: 1}, Edit{edit: EditType::Transpose, count: 1}, Edit{edit: EditType::Match, count: 2}]); } #[test] fn test_basic_levenshtein_search_naive() { let a1 = b"bcc"; let b1 = b"abcde"; let k1 = 1; let mut res: Vec = levenshtein_search_naive_with_opts(a1, b1, k1, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![Match{start: 1, end: 3, k: 1}, Match{start: 1, end: 4, k: 1}]); let a2 = b""; let b2 = b""; let k2 = 1; res = levenshtein_search_naive_with_opts(a2, b2, k2, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![]); let a3 = b"tast"; let b3 = b"testing 123 tating!"; let k3 = 1; res = levenshtein_search_naive_with_opts(a3, b3, k3, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![Match{start: 0, end: 4, k: 1}, Match{start: 12, end: 15, k: 1}]); let a4 = b"tst"; let b4 = b"testing 123 tasting!"; let k4 = 1; res = levenshtein_search_naive_with_opts(a4, b4, k4, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![Match{start: 0, end: 4, k: 1}, Match{start: 12, end: 16, k: 1}]); let a5 = b"tst"; let b5 = b"testing 123 tasting!"; res = levenshtein_search_naive(a5, b5).collect(); assert!(res == vec![Match{start: 0, end: 4, k: 1}, Match{start: 12, end: 16, k: 1}]); let a6 = b"ab"; let b6 = b"ba"; let k6 = 1; res = levenshtein_search_naive_with_opts(a6, b6, k6, SearchType::All, EditCosts::new(1, 1, 0, Some(1)), false).collect(); assert!(res == vec![Match{start: 0, end: 1, k: 1}, Match{start: 0, end: 2, k: 1}]); let a7 = b"test"; let b7 = b"...tseting!"; let k7 = 1; res = levenshtein_search_naive_with_opts(a7, b7, k7, SearchType::All, EditCosts::new(1, 1, 0, Some(1)), false).collect(); assert!(res == vec![Match{start: 3, end: 7, k: 1}]); let a8 = b"test"; let b8 = b"...tssting!"; let k8 = 2; res = levenshtein_search_naive_with_opts(a8, b8, k8, SearchType::All, EditCosts::new(3, 1, 0, None), false).collect(); assert!(res == vec![Match{start: 3, end: 5, k: 2}, Match{start: 3, end: 7, k: 2}]); let a9 = b"tst"; let b9 = b"testing 123 tasting"; let k9 = 1; let res1 = levenshtein_search_naive_with_opts(a9, b9, k9, SearchType::All, 
LEVENSHTEIN_COSTS, false).next().unwrap(); assert!(res1 == Match{start: 0, end: 4, k: 1}); let a10 = b"test"; let b10 = b" testing 123 tasting"; let k10 = 1; res = levenshtein_search_naive_with_opts(a10, b10, k10, SearchType::All, LEVENSHTEIN_COSTS, true).collect(); assert!(res == vec![Match{start: 1, end: 5, k: 1}]); let a11 = b"test"; let b11 = b" etsting 123 tasting"; let k11 = 2; res = levenshtein_search_naive_with_opts(a11, b11, k11, SearchType::All, RDAMERAU_COSTS, true).collect(); assert!(res == vec![Match{start: 0, end: 3, k: 2}, Match{start: 0, end: 4, k: 2}, Match{start: 1, end: 5, k: 2}]); let a12 = b"test"; let b12 = b"etsting"; let k12 = 1; res = levenshtein_search_naive_with_opts(a12, b12, k12, SearchType::All, RDAMERAU_COSTS, true).collect(); assert!(res == vec![Match{start: 0, end: 4, k: 1}]); let a13 = b"test"; let b13 = b"est"; let k13 = 3; res = levenshtein_search_naive_with_opts(a13, b13, k13, SearchType::All, EditCosts::new(1, 1, 2, None), true).collect(); assert!(res == vec![Match{start: 0, end: 3, k: 3}]); let a14 = b"testing"; let b14 = b" teing"; let k14 = 4; res = levenshtein_search_naive_with_opts(a14, b14, k14, SearchType::All, EditCosts::new(1, 1, 2, None), false).collect(); assert!(res == vec![Match{start: 1, end: 8, k: 4}]); let a15 = b"testing"; let b15 = b" teing"; let k15 = 4; res = levenshtein_search_naive_with_opts(a15, b15, k15, SearchType::All, EditCosts::new(2, 1, 2, None), false).collect(); assert!(res == vec![Match{start: 3, end: 8, k: 4}]); let a16 = b"abc"; let b16 = b""; let k16 = 5; res = levenshtein_search_naive_with_opts(a16, b16, k16, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![Match{start: 0, end: 0, k: 3}]); let a17 = b""; let b17 = b"abc"; let k17 = 2; res = levenshtein_search_naive_with_opts(a17, b17, k17, SearchType::All, LEVENSHTEIN_COSTS, true).collect(); assert!(res == vec![Match{start: 0, end: 0, k: 0}, Match{start: 0, end: 1, k: 1}, Match{start: 0, end: 2, k: 2}]); } #[test] fn test_basic_levenshtein_search_simd() { let a1 = b"bcc"; let b1 = b"abcde"; let k1 = 1; let mut res: Vec = levenshtein_search_simd_with_opts(a1, b1, k1, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![Match{start: 1, end: 3, k: 1}, Match{start: 1, end: 4, k: 1}]); let a2 = b""; let b2 = b""; let k2 = 1; res = levenshtein_search_simd_with_opts(a2, b2, k2, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![]); let a3 = b"tast"; let b3 = b"testing 123 tating!"; let k3 = 1; res = levenshtein_search_simd_with_opts(a3, b3, k3, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![Match{start: 0, end: 4, k: 1}, Match{start: 12, end: 15, k: 1}]); let a4 = b"tst"; let b4 = b"testing 123 tasting!"; let k4 = 1; res = levenshtein_search_simd_with_opts(a4, b4, k4, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![Match{start: 0, end: 4, k: 1}, Match{start: 12, end: 16, k: 1}]); let a5 = b"tst"; let b5 = b"testing 123 tasting!"; res = levenshtein_search_simd(a5, b5).collect(); assert!(res == vec![Match{start: 0, end: 4, k: 1}, Match{start: 12, end: 16, k: 1}]); let a6 = b"ab"; let b6 = b"ba"; let k6 = 1; res = levenshtein_search_simd_with_opts(a6, b6, k6, SearchType::All, EditCosts::new(1, 1, 0, Some(1)), false).collect(); assert!(res == vec![Match{start: 0, end: 1, k: 1}, Match{start: 0, end: 2, k: 1}]); let a7 = b"test"; let b7 = b"...tseting!"; let k7 = 1; res = levenshtein_search_simd_with_opts(a7, b7, k7, SearchType::All, EditCosts::new(1, 1, 0, 
Some(1)), false).collect(); assert!(res == vec![Match{start: 3, end: 7, k: 1}]); let a8 = b"test"; let b8 = b"...tssting!"; let k8 = 2; res = levenshtein_search_simd_with_opts(a8, b8, k8, SearchType::All, EditCosts::new(3, 1, 0, None), false).collect(); assert!(res == vec![Match{start: 3, end: 5, k: 2}, Match{start: 3, end: 7, k: 2}]); let a9 = b"tst"; let b9 = b"testing 123 tasting"; let k9 = 1; let res1 = levenshtein_search_simd_with_opts(a9, b9, k9, SearchType::All, LEVENSHTEIN_COSTS, false).next().unwrap(); assert!(res1 == Match{start: 0, end: 4, k: 1}); let a10 = b"test"; let b10 = b" testing 123 tasting"; let k10 = 1; res = levenshtein_search_simd_with_opts(a10, b10, k10, SearchType::All, LEVENSHTEIN_COSTS, true).collect(); assert!(res == vec![Match{start: 1, end: 5, k: 1}]); let a11 = b"test"; let b11 = b" etsting 123 tasting"; let k11 = 2; res = levenshtein_search_simd_with_opts(a11, b11, k11, SearchType::All, RDAMERAU_COSTS, true).collect(); assert!(res == vec![Match{start: 0, end: 3, k: 2}, Match{start: 0, end: 4, k: 2}, Match{start: 1, end: 5, k: 2}]); let a12 = b"test"; let b12 = b"etsting"; let k12 = 1; res = levenshtein_search_simd_with_opts(a12, b12, k12, SearchType::All, RDAMERAU_COSTS, true).collect(); assert!(res == vec![Match{start: 0, end: 4, k: 1}]); let a13 = b"test"; let b13 = b"est"; let k13 = 3; res = levenshtein_search_simd_with_opts(a13, b13, k13, SearchType::All, EditCosts::new(1, 1, 2, None), true).collect(); assert!(res == vec![Match{start: 0, end: 3, k: 3}]); let a14 = b"testing"; let b14 = b" teing"; let k14 = 4; res = levenshtein_search_simd_with_opts(a14, b14, k14, SearchType::All, EditCosts::new(1, 1, 2, None), false).collect(); assert!(res == vec![Match{start: 1, end: 8, k: 4}]); let a15 = b"testing"; let b15 = b" teing"; let k15 = 4; res = levenshtein_search_simd_with_opts(a15, b15, k15, SearchType::All, EditCosts::new(2, 1, 2, None), false).collect(); assert!(res == vec![Match{start: 3, end: 8, k: 4}]); let a16 = b"\0b"; let b16 = b"b\0"; let k16 = 1; res = levenshtein_search_simd_with_opts(a16, b16, k16, SearchType::All, RDAMERAU_COSTS, true).collect(); assert!(res == vec![Match{start: 0, end: 1, k: 1}, Match{start: 0, end: 2, k: 1}]); let a17 = b"\0\0"; let b17 = b"\0\0"; let k17 = 0; res = levenshtein_search_simd_with_opts(a17, b17, k17, SearchType::All, RDAMERAU_COSTS, true).collect(); assert!(res == vec![Match{start: 0, end: 2, k: 0}]); let a18 = b"testing"; let b18 = b" \0esting"; let k18 = 1; res = levenshtein_search_simd_with_opts(a18, b18, k18, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![Match{start: 3, end: 10, k: 1}]); let a19 = b"\0\0\0"; let b19 = b"\0\0"; let k19 = 1; res = levenshtein_search_simd_with_opts(a19, b19, k19, SearchType::All, LEVENSHTEIN_COSTS, true).collect(); assert!(res == vec![Match{start: 0, end: 2, k: 1}]); let a20 = b"\0\0"; let b20 = b" \0\0"; let k20 = 0; res = levenshtein_search_simd_with_opts(a20, b20, k20, SearchType::All, RDAMERAU_COSTS, false).collect(); assert!(res == vec![Match{start: 3, end: 5, k: 0}]); let a21 = b"abc"; let b21 = b""; let k21 = 5; res = levenshtein_search_simd_with_opts(a21, b21, k21, SearchType::All, LEVENSHTEIN_COSTS, false).collect(); assert!(res == vec![Match{start: 0, end: 0, k: 3}]); let a22 = b""; let b22 = b"abc"; let k22 = 2; res = levenshtein_search_simd_with_opts(a22, b22, k22, SearchType::All, LEVENSHTEIN_COSTS, true).collect(); assert!(res == vec![Match{start: 0, end: 0, k: 0}, Match{start: 0, end: 1, k: 1}, Match{start: 0, end: 2, k: 2}]); }
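// A small extra sketch (not part of the original test suite; the test name is illustrative):
// `SearchType::Best` keeps only the lowest-cost matches and collapses fully overlapping ones
// into the longest match, as described in the `levenshtein_search_*` documentation, while
// `SearchType::All` reports every match within the cost threshold.
#[test]
fn test_search_type_best_vs_all_sketch() {
    let needle = b"bcc";
    let haystack = b"abcde";
    let k = 1;

    // All: both ways of ending the match are reported (same inputs as the first case above)
    let all: Vec<Match> = levenshtein_search_naive_with_opts(
        needle, haystack, k, SearchType::All, LEVENSHTEIN_COSTS, false).collect();
    assert!(all == vec![Match{start: 1, end: 3, k: 1}, Match{start: 1, end: 4, k: 1}]);

    // Best: the shorter match is fully contained in the longer one, so only the longer remains
    let best: Vec<Match> = levenshtein_search_naive_with_opts(
        needle, haystack, k, SearchType::Best, LEVENSHTEIN_COSTS, false).collect();
    assert!(best == vec![Match{start: 1, end: 4, k: 1}]);
}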