fearless_simd-0.3.0/.cargo_vcs_info.json

{
  "git": {
    "sha1": "0a3ac74ae62b48a0bcbf6cdbd8c496521a20907c"
  },
  "path_in_vcs": "fearless_simd"
}

fearless_simd-0.3.0/Cargo.lock

# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4

[[package]]
name = "bytemuck"
version = "1.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c76a5792e44e4abe34d3abf15636779261d45a7450612059293d1d2cfc63422"

[[package]]
name = "fearless_simd"
version = "0.3.0"
dependencies = [
 "bytemuck",
 "libm",
]

[[package]]
name = "libm"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f9fbbcab51052fe104eb5e5d351cf728d30a5be1fe14d9be8a3b097481fb97de"

fearless_simd-0.3.0/Cargo.toml

# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2024"
rust-version = "1.86"
name = "fearless_simd"
version = "0.3.0"
authors = ["Raph Levien"]
build = false
autolib = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = "Safer and easier SIMD"
readme = "README.md"
keywords = ["simd"]
categories = ["hardware-support"]
license = "Apache-2.0 OR MIT"
repository = "https://github.com/linebender/fearless_simd"
resolver = "2"

[package.metadata.docs.rs]
all-features = true

[features]
default = ["std"]
libm = ["dep:libm"]
safe_wrappers = []
std = []

[lib]
name = "fearless_simd"
path = "src/lib.rs"

[[example]]
name = "play"
path = "examples/play.rs"

[[example]]
name = "sigmoid"
path = "examples/sigmoid.rs"

[[example]]
name = "srgb"
path = "examples/srgb.rs"

[dependencies.bytemuck]
version = "1.23.0"

[dependencies.libm]
version = "0.2.15"
optional = true

[lints.clippy]
allow_attributes_without_reason = "warn"
cargo_common_metadata = "warn"
cast_possible_truncation = "warn"
collection_is_never_read = "warn"
dbg_macro = "warn"
debug_assert_with_mut_call = "warn"
doc_markdown = "warn"
fn_to_numeric_cast_any = "warn"
infinite_loop = "warn"
large_stack_arrays = "warn"
mismatching_type_param_order = "warn"
missing_assert_message = "warn"
missing_fields_in_debug = "warn"
negative_feature_names = "warn"
redundant_feature_names = "warn"
same_functions_in_if_condition = "warn"
semicolon_if_nothing_returned = "warn"
should_panic_without_expect = "warn"
todo = "warn"
too_many_arguments = "allow"
unseparated_literal_suffix = "warn"
use_self = "warn"
wildcard_dependencies = "warn"

[lints.rust]
elided_lifetimes_in_paths = "warn"
keyword_idents_2024 = "forbid"
missing_debug_implementations = "warn"
missing_docs = "warn"
non_ascii_idents = "forbid"
non_local_definitions = "forbid"
trivial_numeric_casts = "warn"
unnameable_types = "warn"
unreachable_pub = "warn"
unsafe_op_in_unsafe_fn = "forbid"
unused_import_braces = "warn"
unused_lifetimes = "warn"
unused_macro_rules = "warn"

fearless_simd-0.3.0/Cargo.toml.orig

[package]
name = "fearless_simd"
version = "0.3.0"
license.workspace = true
edition.workspace = true
repository.workspace = true
rust-version.workspace = true
authors = ["Raph Levien"]
keywords = ["simd"]
categories = ["hardware-support"]
description = "Safer and easier SIMD"
readme = "README.md"

[package.metadata.docs.rs]
all-features = true
# TODO: Get the right set of targets here. x86 linux, x86-64 linux, arm-macos, wasm
# default-target = "x86_64-unknown-linux-gnu"
# targets = []

[features]
default = ["std"]
# Get floating point functions from the standard library (likely using your target's libc).
# Also allows using `Level::new` on all platforms, to detect which target features are enabled.
std = []
# Use floating point implementations from libm.
libm = ["dep:libm"]
# Include safe wrappers for (some) target feature specific intrinsics,
# beyond the basic SIMD operations abstracted on all platforms.
safe_wrappers = []

[lints]
workspace = true

[dependencies]
bytemuck = "1.23.0"
libm = { version = "0.2.15", optional = true }

fearless_simd-0.3.0/LICENSE-APACHE

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.

      "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner.
      For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:

      (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.

      You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
   END OF TERMS AND CONDITIONS

fearless_simd-0.3.0/LICENSE-MIT

Copyright (c) 2018 Raph Levien

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

fearless_simd-0.3.0/README.md
# Fearless SIMD

**Safer and easier SIMD**

[![Latest published version.](https://img.shields.io/crates/v/fearless_simd.svg)](https://crates.io/crates/fearless_simd)
[![Documentation build status.](https://img.shields.io/docsrs/fearless_simd.svg)](https://docs.rs/fearless_simd)
[![Apache 2.0 or MIT license.](https://img.shields.io/badge/license-Apache--2.0_OR_MIT-blue.svg)](#license)
\
[![Linebender Zulip, #simd channel.](https://img.shields.io/badge/Linebender-%23simd-blue?logo=Zulip)](https://xi.zulipchat.com/#narrow/channel/514230-simd)
[![GitHub Actions CI status.](https://img.shields.io/github/actions/workflow/status/linebender/fearless_simd/ci.yml?logo=github&label=CI)](https://github.com/linebender/fearless_simd/actions)
[![Dependency staleness status.](https://deps.rs/crate/fearless_simd/latest/status.svg)](https://deps.rs/crate/fearless_simd/)
> [!CAUTION]
> Fearless SIMD is in extremely early experimental development. As such, there are no stability
> guarantees, APIs are incomplete, and architectures have missing implementations. Fearless SIMD is
> being developed in conjunction with the [Vello Sparse
> Strips](https://github.com/linebender/vello/) renderer.

[libm]: https://crates.io/crates/libm
[`f32x4`]: https://docs.rs/fearless_simd/latest/fearless_simd/generated/simd_types/struct.f32x4.html
[`Simd`]: https://docs.rs/fearless_simd/0.2.0/fearless_simd/generated/simd_trait/trait.Simd.html
[`SimdFrom`]: https://docs.rs/fearless_simd/0.2.0/fearless_simd/traits/trait.SimdFrom.html
[SimdBase::from_slice]: https://docs.rs/fearless_simd/0.2.0/fearless_simd/generated/simd_trait/trait.SimdBase.html#tymethod.from_slice
[`dispatch`]: https://docs.rs/fearless_simd/0.2.0/fearless_simd/macros/macro.dispatch.html
[`Level`]: https://docs.rs/fearless_simd/0.2.0/fearless_simd/enum.Level.html
[`Level::new`]: https://docs.rs/fearless_simd/0.2.0/fearless_simd/enum.Level.html#method.new
[`std::simd`]: https://doc.rust-lang.org/std/simd/index.html

A helper library to make SIMD more friendly. Fearless SIMD exposes safe SIMD with ergonomic multi-versioning in Rust.

Fearless SIMD uses "marker values" which serve as proofs of which target features are available on the current CPU. These each implement the [`Simd`] trait, which exposes a core set of SIMD operations, implemented as efficiently as possible on each target platform.

Additionally, there are types for packed vectors of a specific width and element type (such as [`f32x4`]). Fearless SIMD does not currently support vectors narrower than 128 bits. These vector types implement the standard arithmetic traits (i.e. they can be added together using `+`, multiplied by a scalar using `*`, among others), implemented as efficiently as possible using SIMD instructions. They can be created in a SIMD context using the [`SimdFrom`] trait or the [`from_slice`][SimdBase::from_slice] associated function.

To call a function with the best available target features and get the associated `Simd` implementation, use the [`dispatch!`][`dispatch`] macro:

```rust
use fearless_simd::{Level, Simd, dispatch};

#[inline(always)]
fn sigmoid<S: Simd>(simd: S, x: &[f32], out: &mut [f32]) { /* ... */ }

// The stored level, which you should only construct once in your application.
let level = Level::new();
dispatch!(level, simd => sigmoid(simd, &[/*...*/], &mut [/*...*/]));
```

A few things to note:

1) `sigmoid` is generic over any `Simd` type.
2) The [`dispatch!`][`dispatch`] macro invokes the given function with the target features associated with the supplied [`Level`].
3) The function or closure passed to [`dispatch!`][`dispatch`] should be `#[inline(always)]`; the performance of the SIMD implementation may be poor if it isn't. See [the section on inlining for details](#inlining).

The first parameter to [`dispatch!`][`dispatch`] is the [`Level`]. If you are writing an application, you should create this once (using [`Level::new`]) and pass it to any function which wants to use SIMD. This type stores which instruction sets are available to the current process, and the macro uses it to dispatch to the best variant of the supplied function for this process.

## Inlining

Fearless SIMD relies heavily on Rust's inlining support to create functions which have the given target features enabled. As such, most functions which you write when using Fearless SIMD should have the `#[inline(always)]` attribute.
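
As a minimal sketch of what this looks like in practice (the kernel name `scale_and_offset` is hypothetical; the trait paths and operator usage follow the crate's bundled `sigmoid` example):

```rust
use fearless_simd::{Simd, SimdBase};

// A hypothetical `#[inline(always)]` kernel: vectors are built with
// `from_slice` and combined with ordinary arithmetic operators, which
// lower to SIMD instructions for the dispatched target features.
#[inline(always)]
fn scale_and_offset<S: Simd>(simd: S, x: &[f32], out: &mut [f32]) {
    let n = S::f32s::N;
    for (x, y) in x.chunks_exact(n).zip(out.chunks_exact_mut(n)) {
        let a = S::f32s::from_slice(simd, x);
        // Vector-scalar multiply and add.
        let b = a * 2.0 + 1.0;
        y.copy_from_slice(b.as_slice());
    }
}
```

Because the kernel is `#[inline(always)]`, it is inlined into the target-feature-annotated function generated by [`dispatch!`][`dispatch`], so the operators compile down to the selected instruction set.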
## WebAssembly

WASM SIMD doesn't have runtime feature detection, so you need to compile two versions of your bundle for WASM, one with SIMD and one without, then select the appropriate one for your user's browser.

TODO: Expand on this.

## Credits

This crate was inspired by [`pulp`] and [`std::simd`], among others in the Rust ecosystem, though it makes many decisions differently. It benefited from conversations with Luca Versari, though he is not responsible for any of the mistakes or bad decisions.

## Feature Flags

The following crate [feature flags](https://doc.rust-lang.org/cargo/reference/features.html#dependency-features) are available:

- `std` (enabled by default): Get floating point functions from the standard library (likely using your target's libc). Also allows using [`Level::new`] on all platforms, to detect which target features are enabled.
- `libm`: Use floating point implementations from [libm].
- `safe_wrappers`: Include safe wrappers for (some) target feature specific intrinsics, beyond the basic SIMD operations abstracted on all platforms.

At least one of `std` and `libm` is required; `std` overrides `libm`.

[`pulp`]: https://crates.io/crates/pulp

## Minimum supported Rust Version (MSRV)

This version of Fearless SIMD has been verified to compile with **Rust 1.86** and later.

Future versions of Fearless SIMD might increase the Rust version requirement. This will not be treated as a breaking change, and as such can even happen with small patch releases.

## Community

[![Linebender Zulip, #simd channel.](https://img.shields.io/badge/Linebender-%23simd-blue?logo=Zulip)](https://xi.zulipchat.com/#narrow/channel/514230-simd)

Discussion of Fearless SIMD development happens in the [Linebender Zulip](https://xi.zulipchat.com/), specifically in [#simd](https://xi.zulipchat.com/#narrow/channel/514230-simd). All public content can be read without logging in.

Contributions are welcome by pull request. The [Rust code of conduct] applies.

## License

Licensed under either of

- Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE) or <http://www.apache.org/licenses/LICENSE-2.0>)
- MIT license ([LICENSE-MIT](LICENSE-MIT) or <http://opensource.org/licenses/MIT>)

at your option.

[Rust Code of Conduct]: https://www.rust-lang.org/policies/code-of-conduct

fearless_simd-0.3.0/examples/play.rs

// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

#![expect(
    missing_docs,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]

use fearless_simd::{Level, Simd, SimdBase, WithSimd, dispatch};

// The WithSimd idea is adapted from pulp but is clunky; we
// will probably prefer the `dispatch!` macro.
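
// `Foo` implements `WithSimd`: `Level::dispatch` (see `main` below) selects
// the best available target features and calls `with_simd` with the
// corresponding marker value.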
struct Foo;

impl WithSimd for Foo {
    type Output = f32;

    #[inline(always)]
    fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
        let a = simd.splat_f32x4(42.0);
        let b = a + a;
        b[0]
    }
}

#[inline(always)]
fn foo<S: Simd>(simd: S, x: f32) -> f32 {
    let n = S::f32s::N;
    println!("n = {n}");
    simd.splat_f32x4(x).sqrt()[0]
}

// currently requires `safe_wrappers` feature
fn do_something_on_neon(_level: Level) -> f32 {
    #[cfg(all(feature = "safe_wrappers", target_arch = "aarch64"))]
    if let Some(neon) = _level.as_neon() {
        return neon.vectorize(
            #[inline(always)]
            || {
                let v = neon.neon.vdupq_n_f32(42.0);
                neon.neon.vgetq_lane_f32::<0>(v)
            },
        );
    }
    0.0
}

fn main() {
    let level = Level::new();
    let x = level.dispatch(Foo);
    let y = dispatch!(level, simd => foo(simd, 42.0));
    let z = do_something_on_neon(level);
    println!("level = {level:?}, x = {x}, y = {y}, z = {z}");
}

fearless_simd-0.3.0/examples/sigmoid.rs

// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

#![expect(
    missing_docs,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]

use fearless_simd::{Level, Simd, SimdBase, SimdFloat, dispatch};

#[inline(always)]
fn sigmoid<S: Simd>(simd: S, x: &[f32], out: &mut [f32]) {
    let n = S::f32s::N;
    for (x, y) in x.chunks_exact(n).zip(out.chunks_exact_mut(n)) {
        let a = S::f32s::from_slice(simd, x);
        let b = a / (a * a + 1.0).sqrt();
        y.copy_from_slice(b.as_slice());
    }
}

fn main() {
    let level = Level::new();
    let inp = [0.1, -0.2, 0.001, 0.4, 1., 2., 3., 4.];
    let mut out = [0.; 8];
    dispatch!(level, simd => sigmoid(simd, &inp, &mut out));
    println!("{out:?}");
}

fearless_simd-0.3.0/examples/srgb.rs

// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

#![expect(
    clippy::excessive_precision,
    missing_docs,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]

use fearless_simd::{Level, Select, Simd, SimdInto, dispatch, f32x4};

// This block shows how to use safe wrappers for compile-time enforcement
// of using valid SIMD intrinsics.
#[cfg(feature = "safe_wrappers")]
#[inline(always)]
fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
    // #[cfg(target_arch = "x86_64")]
    // if let Some(avx2) = a.simd.level().as_avx2() {
    //     return avx2
    //         .sse4_1
    //         ._mm_blend_ps::<8>(a.into(), b.into())
    //         .simd_into(a.simd);
    // }
    #[cfg(target_arch = "aarch64")]
    if let Some(neon) = a.simd.level().as_neon() {
        return neon
            .neon
            .vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
            .simd_into(a.simd);
    }
    let mut result = a;
    result[3] = b[3];
    result
}

// This block lets the example compile without safe wrappers.
#[cfg(not(feature = "safe_wrappers"))] #[inline(always)] fn copy_alpha(a: f32x4, b: f32x4) -> f32x4 { #[cfg(target_arch = "aarch64")] if let Some(_neon) = a.simd.level().as_neon() { unsafe { return core::arch::aarch64::vcopyq_laneq_f32::<3, 3>(a.into(), b.into()) .simd_into(a.simd); } } let mut result = a; result[3] = b[3]; result } #[inline(always)] fn to_srgb(simd: S, rgba: [f32; 4]) -> [f32; 4] { let v: f32x4 = rgba.simd_into(simd); let vabs = v.abs(); let x = vabs - 5.35862651e-04; let x2 = x * x; let even1 = x * -9.12795913e-01 + -2.88143143e-02; let even2 = x2 * -7.29192910e-01 + even1; let odd1 = x * 1.06133172e+00 + 1.40194533e+00; let odd2 = x2 * 2.07758287e-01 + odd1; let poly = odd2 * x.sqrt() + even2; let lin = vabs * 12.92; let z = vabs.simd_gt(0.0031308).select(poly, lin); let z_signed = z.copysign(v); let result = copy_alpha(z_signed, v); result.into() } fn main() { let level = Level::new(); let rgba = [0.1, -0.2, 0.001, 0.4]; let srgb = dispatch!(level, simd=> to_srgb(simd, rgba)); println!("{srgb:?}"); } fearless_simd-0.3.0/src/core_arch/aarch64/mod.rs000064400000000000000000007264271046102023000175570ustar 00000000000000// Copyright 2024 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT //! Access to intrinsics on aarch64. // These implementations are cut and pasted from pulp. /// A token for Neon intrinsics on aarch64. #[derive(Clone, Copy, Debug)] pub struct Neon { _private: (), } impl Neon { /// Create a SIMD token. /// /// # Safety /// /// The required CPU features must be available. #[inline] pub const unsafe fn new_unchecked() -> Self { Self { _private: () } } } #[cfg(feature = "safe_wrappers")] use {crate::impl_macros::delegate, core::arch::aarch64::*}; #[cfg(feature = "safe_wrappers")] type p8 = u8; #[cfg(feature = "safe_wrappers")] type p16 = u16; #[cfg(feature = "safe_wrappers")] type p64 = u64; #[cfg(feature = "safe_wrappers")] type p128 = u128; #[cfg(feature = "safe_wrappers")] #[expect( clippy::missing_safety_doc, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] impl Neon { delegate! 
        core::arch::aarch64:
        fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vandq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vand_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vandq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vand_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vandq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vand_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vandq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vand_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vandq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vand_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vandq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vand_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
        fn vandq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
        fn vand_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
        fn vandq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
        fn vorr_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vorrq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vorr_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vorrq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vorr_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vorrq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vorr_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vorrq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vorr_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vorrq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vorr_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vorrq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vorr_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
        fn vorrq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
        fn vorr_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
        fn vorrq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
        fn veor_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn veorq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn veor_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn veorq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn veor_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn veorq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn veor_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn veorq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn veor_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn veorq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn veor_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn veorq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn veor_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
        fn veorq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
        fn veor_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
        fn veorq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
        fn vabd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vabdq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vabd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vabdq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vabd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vabdq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vabd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vabdq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vabd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vabdq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vabd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vabdq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vabd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
        fn vabdq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
        fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t;
        fn vabdl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t;
        fn vabdl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t;
        fn vabdl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t;
        fn vabdl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t;
        fn vabdl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t;
        fn vceq_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vceqq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vceq_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vceqq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vceq_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vceqq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vceq_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t;
        fn vceqq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t;
        fn vceq_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t;
        fn vceqq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t;
        fn vceq_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t;
        fn vceqq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t;
        fn vceq_p8(a: poly8x8_t, b: poly8x8_t) -> uint8x8_t;
        fn vceqq_p8(a: poly8x16_t, b: poly8x16_t) -> uint8x16_t;
        fn vceq_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
        fn vceqq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
        fn vtst_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t;
        fn vtstq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t;
        fn vtst_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t;
        fn vtstq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t;
        fn vtst_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t;
        fn vtstq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t;
        fn vtst_p8(a: poly8x8_t, b: poly8x8_t) -> uint8x8_t;
        fn vtstq_p8(a: poly8x16_t, b: poly8x16_t) -> uint8x16_t;
        fn vtst_p16(a: poly16x4_t, b: poly16x4_t) -> uint16x4_t;
        fn vtstq_p16(a: poly16x8_t, b: poly16x8_t) -> uint16x8_t;
        fn vtst_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vtstq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vtst_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vtstq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vtst_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vtstq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vabs_f32(a: float32x2_t) -> float32x2_t;
        fn vabsq_f32(a: float32x4_t) -> float32x4_t;
        fn vcgt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t;
        fn vcgtq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t;
        fn vcgt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t;
        fn vcgtq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t;
        fn vcgt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t;
        fn vcgtq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t;
        fn vcgt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vcgtq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vcgt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vcgtq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vcgt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vcgtq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vcgt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
        fn vcgtq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
        fn vclt_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t;
        fn vcltq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t;
        fn vclt_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t;
        fn vcltq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t;
        fn vclt_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t;
        fn vcltq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t;
        fn vclt_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vcltq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vclt_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vcltq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vclt_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vcltq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vclt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
        fn vcltq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
        fn vcle_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t;
        fn vcleq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t;
        fn vcle_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t;
        fn vcleq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t;
        fn vcle_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t;
        fn vcleq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t;
        fn vcle_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vcleq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vcle_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vcleq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vcle_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vcleq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vcle_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
        fn vcleq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
        fn vcge_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t;
        fn vcgeq_s8(a: int8x16_t, b: int8x16_t) -> uint8x16_t;
        fn vcge_s16(a: int16x4_t, b: int16x4_t) -> uint16x4_t;
        fn vcgeq_s16(a: int16x8_t, b: int16x8_t) -> uint16x8_t;
        fn vcge_s32(a: int32x2_t, b: int32x2_t) -> uint32x2_t;
        fn vcgeq_s32(a: int32x4_t, b: int32x4_t) -> uint32x4_t;
        fn vcge_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vcgeq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vcge_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vcgeq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vcge_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vcgeq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vcge_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
        fn vcgeq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
        fn vcls_s8(a: int8x8_t) -> int8x8_t;
        fn vclsq_s8(a: int8x16_t) -> int8x16_t;
        fn vcls_s16(a: int16x4_t) -> int16x4_t;
        fn vclsq_s16(a: int16x8_t) -> int16x8_t;
        fn vcls_s32(a: int32x2_t) -> int32x2_t;
        fn vclsq_s32(a: int32x4_t) -> int32x4_t;
        fn vcls_u8(a: uint8x8_t) -> int8x8_t;
        fn vclsq_u8(a: uint8x16_t) -> int8x16_t;
        fn vcls_u16(a: uint16x4_t) -> int16x4_t;
        fn vclsq_u16(a: uint16x8_t) -> int16x8_t;
        fn vcls_u32(a: uint32x2_t) -> int32x2_t;
        fn vclsq_u32(a: uint32x4_t) -> int32x4_t;
        fn vclz_s8(a: int8x8_t) -> int8x8_t;
        fn vclzq_s8(a: int8x16_t) -> int8x16_t;
        fn vclz_s16(a: int16x4_t) -> int16x4_t;
        fn vclzq_s16(a: int16x8_t) -> int16x8_t;
        fn vclz_s32(a: int32x2_t) -> int32x2_t;
        fn vclzq_s32(a: int32x4_t) -> int32x4_t;
        fn vclz_u8(a: uint8x8_t) -> uint8x8_t;
        fn vclzq_u8(a: uint8x16_t) -> uint8x16_t;
        fn vclz_u16(a: uint16x4_t) -> uint16x4_t;
        fn vclzq_u16(a: uint16x8_t) -> uint16x8_t;
        fn vclz_u32(a: uint32x2_t) -> uint32x2_t;
        fn vclzq_u32(a: uint32x4_t) -> uint32x4_t;
        fn vcagt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
        fn vcagtq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
        fn vcage_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
        fn vcageq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
        fn vcalt_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
        fn vcaltq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
        fn vcale_f32(a: float32x2_t, b: float32x2_t) -> uint32x2_t;
        fn vcaleq_f32(a: float32x4_t, b: float32x4_t) -> uint32x4_t;
        fn vcreate_s8(a: u64) -> int8x8_t;
        fn vcreate_s16(a: u64) -> int16x4_t;
        fn vcreate_s32(a: u64) -> int32x2_t;
        fn vcreate_s64(a: u64) -> int64x1_t;
        fn vcreate_u8(a: u64) -> uint8x8_t;
        fn vcreate_u16(a: u64) -> uint16x4_t;
        fn vcreate_u32(a: u64) -> uint32x2_t;
        fn vcreate_u64(a: u64) -> uint64x1_t;
        fn vcreate_p8(a: u64) -> poly8x8_t;
        fn vcreate_p16(a: u64) -> poly16x4_t;
        fn vcreate_f32(a: u64) -> float32x2_t;
        fn vcvt_f32_s32(a: int32x2_t) -> float32x2_t;
        fn vcvtq_f32_s32(a: int32x4_t) -> float32x4_t;
        fn vcvt_f32_u32(a: uint32x2_t) -> float32x2_t;
        fn vcvtq_f32_u32(a: uint32x4_t) -> float32x4_t;
        fn vcvt_n_f32_s32(a: int32x2_t) -> float32x2_t;
        fn vcvtq_n_f32_s32(a: int32x4_t) -> float32x4_t;
        fn vcvt_n_f32_u32(a: uint32x2_t) -> float32x2_t;
        fn vcvtq_n_f32_u32(a: uint32x4_t) -> float32x4_t;
        fn vcvt_n_s32_f32(a: float32x2_t) -> int32x2_t;
        fn vcvtq_n_s32_f32(a: float32x4_t) -> int32x4_t;
        fn vcvt_n_u32_f32(a: float32x2_t) -> uint32x2_t;
        fn vcvtq_n_u32_f32(a: float32x4_t) -> uint32x4_t;
        fn vcvt_s32_f32(a: float32x2_t) -> int32x2_t;
        fn vcvtq_s32_f32(a: float32x4_t) -> int32x4_t;
        fn vcvt_u32_f32(a: float32x2_t) -> uint32x2_t;
        fn vcvtq_u32_f32(a: float32x4_t) -> uint32x4_t;
        fn vdup_lane_s8(a: int8x8_t) -> int8x8_t;
        fn vdupq_laneq_s8(a: int8x16_t) -> int8x16_t;
        fn vdup_lane_s16(a: int16x4_t) -> int16x4_t;
        fn vdupq_laneq_s16(a: int16x8_t) -> int16x8_t;
        fn vdup_lane_s32(a: int32x2_t) -> int32x2_t;
        fn vdupq_laneq_s32(a: int32x4_t) -> int32x4_t;
        fn vdup_laneq_s8(a: int8x16_t) -> int8x8_t;
        fn vdup_laneq_s16(a: int16x8_t) -> int16x4_t;
        fn vdup_laneq_s32(a: int32x4_t) -> int32x2_t;
        fn vdupq_lane_s8(a: int8x8_t) -> int8x16_t;
        fn vdupq_lane_s16(a: int16x4_t) -> int16x8_t;
        fn vdupq_lane_s32(a: int32x2_t) -> int32x4_t;
        fn vdup_lane_u8(a: uint8x8_t) -> uint8x8_t;
        fn vdupq_laneq_u8(a: uint8x16_t) -> uint8x16_t;
        fn vdup_lane_u16(a: uint16x4_t) -> uint16x4_t;
        fn vdupq_laneq_u16(a: uint16x8_t) -> uint16x8_t;
        fn vdup_lane_u32(a: uint32x2_t) -> uint32x2_t;
        fn vdupq_laneq_u32(a: uint32x4_t) -> uint32x4_t;
        fn vdup_laneq_u8(a: uint8x16_t) -> uint8x8_t;
        fn vdup_laneq_u16(a: uint16x8_t) -> uint16x4_t;
        fn vdup_laneq_u32(a: uint32x4_t) -> uint32x2_t;
        fn vdupq_lane_u8(a: uint8x8_t) -> uint8x16_t;
        fn vdupq_lane_u16(a: uint16x4_t) -> uint16x8_t;
        fn vdupq_lane_u32(a: uint32x2_t) -> uint32x4_t;
        fn vdup_lane_p8(a: poly8x8_t) -> poly8x8_t;
        fn vdupq_laneq_p8(a: poly8x16_t) -> poly8x16_t;
        fn vdup_lane_p16(a: poly16x4_t) -> poly16x4_t;
        fn vdupq_laneq_p16(a: poly16x8_t) -> poly16x8_t;
        fn vdup_laneq_p8(a: poly8x16_t) -> poly8x8_t;
        fn vdup_laneq_p16(a: poly16x8_t) -> poly16x4_t;
        fn vdupq_lane_p8(a: poly8x8_t) -> poly8x16_t;
        fn vdupq_lane_p16(a: poly16x4_t) -> poly16x8_t;
        fn vdupq_laneq_s64(a: int64x2_t) -> int64x2_t;
        fn vdupq_lane_s64(a: int64x1_t) -> int64x2_t;
        fn vdupq_laneq_u64(a: uint64x2_t) -> uint64x2_t;
        fn vdupq_lane_u64(a: uint64x1_t) -> uint64x2_t;
        fn vdup_lane_f32(a: float32x2_t) -> float32x2_t;
        fn vdupq_laneq_f32(a: float32x4_t) -> float32x4_t;
        fn vdup_laneq_f32(a: float32x4_t) -> float32x2_t;
        fn vdupq_lane_f32(a: float32x2_t) -> float32x4_t;
        fn vdup_lane_s64(a: int64x1_t) -> int64x1_t;
        fn vdup_lane_u64(a: uint64x1_t) -> uint64x1_t;
        fn vdup_laneq_s64(a: int64x2_t) -> int64x1_t;
        fn vdup_laneq_u64(a: uint64x2_t) -> uint64x1_t;
        fn vext_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vextq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vext_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vextq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vext_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vextq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vext_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vextq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vext_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vextq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vext_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vextq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vext_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t;
        fn vextq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t;
        fn vext_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t;
        fn vextq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t;
        fn vextq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
        fn vextq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
        fn vext_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
        fn vextq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
        fn vmla_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
        fn vmlaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t;
        fn vmla_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t;
        fn vmlaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
        fn vmla_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t;
        fn vmlaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
        fn vmla_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t;
        fn vmlaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t;
        fn vmla_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t;
        fn vmlaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t;
        fn vmla_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t;
        fn vmlaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
        fn vmla_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
        fn vmlaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
        fn vmla_n_s16(a: int16x4_t, b: int16x4_t, c: i16) -> int16x4_t;
        fn vmlaq_n_s16(a: int16x8_t, b: int16x8_t, c: i16) -> int16x8_t;
        fn vmla_n_s32(a: int32x2_t, b: int32x2_t, c: i32) -> int32x2_t;
        fn vmlaq_n_s32(a: int32x4_t, b: int32x4_t, c: i32) -> int32x4_t;
        fn vmla_n_u16(a: uint16x4_t, b: uint16x4_t, c: u16) -> uint16x4_t;
        fn vmlaq_n_u16(a: uint16x8_t, b: uint16x8_t, c: u16) -> uint16x8_t;
        fn vmla_n_u32(a: uint32x2_t, b: uint32x2_t, c: u32) -> uint32x2_t;
        fn vmlaq_n_u32(a: uint32x4_t, b: uint32x4_t, c: u32) -> uint32x4_t;
        fn vmla_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t;
        fn vmlaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t;
        fn vmla_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t;
        fn vmla_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t;
        fn vmlaq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t;
        fn vmlaq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
        fn vmla_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t;
        fn vmla_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t;
        fn vmlaq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t;
        fn vmlaq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
        fn vmla_lane_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t;
        fn vmla_laneq_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t;
        fn vmlaq_lane_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t;
        fn vmlaq_laneq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t;
        fn vmla_lane_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t;
        fn vmla_laneq_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t;
        fn vmlaq_lane_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t;
        fn vmlaq_laneq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
        fn vmla_lane_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
        fn vmla_laneq_f32(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t;
        fn vmlaq_lane_f32(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t;
        fn vmlaq_laneq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
        fn vmlal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t;
        fn vmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t;
        fn vmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t;
        fn vmlal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t;
        fn vmlal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t;
        fn vmlal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t;
        fn vmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t;
        fn vmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t;
        fn vmlal_n_u16(a: uint32x4_t, b: uint16x4_t, c: u16) -> uint32x4_t;
        fn vmlal_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t;
        fn vmlal_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t;
        fn vmlal_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t;
        fn vmlal_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t;
        fn vmlal_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t;
        fn vmlal_lane_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t;
        fn vmlal_laneq_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t;
        fn vmlal_lane_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t;
        fn vmlal_laneq_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t;
        fn vmls_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
        fn vmlsq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t;
        fn vmls_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t;
        fn vmlsq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
        fn vmls_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t;
        fn vmlsq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
        fn vmls_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t;
        fn vmlsq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t;
        fn vmls_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t;
        fn vmlsq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t;
        fn vmls_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t;
        fn vmlsq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
        fn vmls_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
        fn vmlsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
        fn vmls_n_s16(a: int16x4_t, b: int16x4_t, c: i16) -> int16x4_t;
        fn vmlsq_n_s16(a: int16x8_t, b: int16x8_t, c: i16) -> int16x8_t;
        fn vmls_n_s32(a: int32x2_t, b: int32x2_t, c: i32) -> int32x2_t;
        fn vmlsq_n_s32(a: int32x4_t, b: int32x4_t, c: i32) -> int32x4_t;
        fn vmls_n_u16(a: uint16x4_t, b: uint16x4_t, c: u16) -> uint16x4_t;
        fn vmlsq_n_u16(a: uint16x8_t, b: uint16x8_t, c: u16) -> uint16x8_t;
        fn vmls_n_u32(a: uint32x2_t, b: uint32x2_t, c: u32) -> uint32x2_t;
        fn vmlsq_n_u32(a: uint32x4_t, b: uint32x4_t, c: u32) -> uint32x4_t;
        fn vmls_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t;
        fn vmlsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t;
        fn vmls_lane_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t;
        fn vmls_laneq_s16(a: int16x4_t, b: int16x4_t, c: int16x8_t) -> int16x4_t;
        fn vmlsq_lane_s16(a: int16x8_t, b: int16x8_t, c: int16x4_t) -> int16x8_t;
        fn vmlsq_laneq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
        fn vmls_lane_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t;
        fn vmls_laneq_s32(a: int32x2_t, b: int32x2_t, c: int32x4_t) -> int32x2_t;
        fn vmlsq_lane_s32(a: int32x4_t, b: int32x4_t, c: int32x2_t) -> int32x4_t;
        fn vmlsq_laneq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
        fn vmls_lane_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t;
        fn vmls_laneq_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x8_t) -> uint16x4_t;
        fn vmlsq_lane_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x4_t) -> uint16x8_t;
        fn vmlsq_laneq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t;
        fn vmls_lane_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t;
        fn vmls_laneq_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x4_t) -> uint32x2_t;
        fn vmlsq_lane_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x2_t) -> uint32x4_t;
        fn vmlsq_laneq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
        fn vmls_lane_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
        fn vmls_laneq_f32(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t;
        fn vmlsq_lane_f32(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t;
        fn vmlsq_laneq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
        fn vmlsl_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t;
        fn vmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t;
        fn vmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t;
        fn vmlsl_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t;
        fn vmlsl_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t;
        fn vmlsl_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t;
        fn vmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t;
        fn vmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t;
        fn vmlsl_n_u16(a: uint32x4_t, b: uint16x4_t, c: u16) -> uint32x4_t;
        fn vmlsl_n_u32(a: uint64x2_t, b: uint32x2_t, c: u32) -> uint64x2_t;
        fn vmlsl_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t;
        fn vmlsl_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t;
        fn vmlsl_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t;
        fn vmlsl_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t;
        fn vmlsl_lane_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t;
        fn vmlsl_laneq_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x8_t) -> uint32x4_t;
        fn vmlsl_lane_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t;
        fn vmlsl_laneq_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x4_t) -> uint64x2_t;
        fn vneg_s8(a: int8x8_t) -> int8x8_t;
        fn vnegq_s8(a: int8x16_t) -> int8x16_t;
        fn vneg_s16(a: int16x4_t) -> int16x4_t;
        fn vnegq_s16(a: int16x8_t) -> int16x8_t;
        fn vneg_s32(a: int32x2_t) -> int32x2_t;
        fn vnegq_s32(a: int32x4_t) -> int32x4_t;
        fn vneg_f32(a: float32x2_t) -> float32x2_t;
        fn vnegq_f32(a: float32x4_t) -> float32x4_t;
        fn vqneg_s8(a: int8x8_t) -> int8x8_t;
        fn vqnegq_s8(a: int8x16_t) -> int8x16_t;
        fn vqneg_s16(a: int16x4_t) -> int16x4_t;
        fn vqnegq_s16(a: int16x8_t) -> int16x8_t;
        fn vqneg_s32(a: int32x2_t) -> int32x2_t;
        fn vqnegq_s32(a: int32x4_t) -> int32x4_t;
        fn vqsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vqsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vqsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vqsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vqsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vqsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vqsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
        fn vqsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
        fn vqsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vqsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vqsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vqsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vqsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vqsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vqsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
        fn vqsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
        fn vhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vrhadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vrhaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vrhadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vrhaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vrhadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vrhaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vrhadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vrhaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vrhadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vrhaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vrhadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vrhaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vrndn_f32(a: float32x2_t) -> float32x2_t;
        fn vrndnq_f32(a: float32x4_t) -> float32x4_t;
        fn vqadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vqaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vqadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vqaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vqadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vqaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vqadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
        fn vqaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
        fn vqadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vqaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vqadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vqaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vqadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vqaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vqadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
        fn vqaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
        unsafe fn vld1_s8_x2(a: *const i8) -> int8x8x2_t;
        unsafe fn vld1_s16_x2(a: *const i16) -> int16x4x2_t;
        unsafe fn vld1_s32_x2(a: *const i32) -> int32x2x2_t;
        unsafe fn vld1_s64_x2(a: *const i64) -> int64x1x2_t;
        unsafe fn vld1q_s8_x2(a: *const i8) -> int8x16x2_t;
        unsafe fn vld1q_s16_x2(a: *const i16) -> int16x8x2_t;
        unsafe fn vld1q_s32_x2(a: *const i32) -> int32x4x2_t;
        unsafe fn vld1q_s64_x2(a: *const i64) -> int64x2x2_t;
        unsafe fn vld1_s8_x3(a: *const i8) -> int8x8x3_t;
        unsafe fn vld1_s16_x3(a: *const i16) -> int16x4x3_t;
        unsafe fn vld1_s32_x3(a: *const i32) -> int32x2x3_t;
        unsafe fn vld1_s64_x3(a: *const i64) -> int64x1x3_t;
        unsafe fn vld1q_s8_x3(a: *const i8) -> int8x16x3_t;
        unsafe fn vld1q_s16_x3(a: *const i16) -> int16x8x3_t;
        unsafe fn vld1q_s32_x3(a: *const i32) -> int32x4x3_t;
        unsafe fn vld1q_s64_x3(a: *const i64) -> int64x2x3_t;
        unsafe fn vld1_s8_x4(a: *const i8) -> int8x8x4_t;
        unsafe fn vld1_s16_x4(a: *const i16) -> int16x4x4_t;
        unsafe fn vld1_s32_x4(a: *const i32) -> int32x2x4_t;
        unsafe fn vld1_s64_x4(a: *const i64) -> int64x1x4_t;
        unsafe fn vld1q_s8_x4(a: *const i8) -> int8x16x4_t;
        unsafe fn vld1q_s16_x4(a: *const i16) -> int16x8x4_t;
        unsafe fn vld1q_s32_x4(a: *const i32) -> int32x4x4_t;
        unsafe fn vld1q_s64_x4(a: *const i64) -> int64x2x4_t;
        unsafe fn vld1_u8_x2(a: *const u8) -> uint8x8x2_t;
        unsafe fn vld1_u16_x2(a: *const u16) -> uint16x4x2_t;
        unsafe fn vld1_u32_x2(a: *const u32) -> uint32x2x2_t;
        unsafe fn vld1_u64_x2(a: *const u64) -> uint64x1x2_t;
        unsafe fn vld1q_u8_x2(a: *const u8) -> uint8x16x2_t;
        unsafe fn vld1q_u16_x2(a: *const u16) -> uint16x8x2_t;
        unsafe fn vld1q_u32_x2(a: *const u32) -> uint32x4x2_t;
        unsafe fn vld1q_u64_x2(a: *const u64) -> uint64x2x2_t;
        unsafe fn vld1_u8_x3(a: *const u8) -> uint8x8x3_t;
        unsafe fn vld1_u16_x3(a: *const u16) -> uint16x4x3_t;
        unsafe fn vld1_u32_x3(a: *const u32) -> uint32x2x3_t;
        unsafe fn vld1_u64_x3(a: *const u64) -> uint64x1x3_t;
        unsafe fn vld1q_u8_x3(a: *const u8) -> uint8x16x3_t;
        unsafe fn vld1q_u16_x3(a: *const u16) -> uint16x8x3_t;
        unsafe fn vld1q_u32_x3(a: *const u32) -> uint32x4x3_t;
        unsafe fn vld1q_u64_x3(a: *const u64) -> uint64x2x3_t;
        unsafe fn vld1_u8_x4(a: *const u8) -> uint8x8x4_t;
        unsafe fn vld1_u16_x4(a: *const u16) -> uint16x4x4_t;
        unsafe fn vld1_u32_x4(a: *const u32) -> uint32x2x4_t;
        unsafe fn vld1_u64_x4(a: *const u64) -> uint64x1x4_t;
        unsafe fn vld1q_u8_x4(a: *const u8) -> uint8x16x4_t;
        unsafe fn vld1q_u16_x4(a: *const u16) -> uint16x8x4_t;
        unsafe fn vld1q_u32_x4(a: *const u32) -> uint32x4x4_t;
        unsafe fn vld1q_u64_x4(a: *const u64) -> uint64x2x4_t;
        unsafe fn vld1_p8_x2(a: *const p8) -> poly8x8x2_t;
        unsafe fn vld1_p8_x3(a: *const p8) -> poly8x8x3_t;
        unsafe fn vld1_p8_x4(a: *const p8) -> poly8x8x4_t;
        unsafe fn vld1q_p8_x2(a: *const p8) -> poly8x16x2_t;
        unsafe fn vld1q_p8_x3(a: *const p8) -> poly8x16x3_t;
        unsafe fn vld1q_p8_x4(a: *const p8) -> poly8x16x4_t;
        unsafe fn vld1_p16_x2(a: *const p16) -> poly16x4x2_t;
        unsafe fn vld1_p16_x3(a: *const p16) -> poly16x4x3_t;
        unsafe fn vld1_p16_x4(a: *const p16) -> poly16x4x4_t;
        unsafe fn vld1q_p16_x2(a: *const p16) -> poly16x8x2_t;
        unsafe fn vld1q_p16_x3(a: *const p16) -> poly16x8x3_t;
        unsafe fn vld1q_p16_x4(a: *const p16) -> poly16x8x4_t;
        unsafe fn vld1_f32_x2(a: *const f32) -> float32x2x2_t;
        unsafe fn vld1q_f32_x2(a: *const f32) -> float32x4x2_t;
        unsafe fn vld1_f32_x3(a: *const f32) -> float32x2x3_t;
        unsafe fn vld1q_f32_x3(a: *const f32) -> float32x4x3_t;
        unsafe fn vld1_f32_x4(a: *const f32) -> float32x2x4_t;
        unsafe fn vld1q_f32_x4(a: *const f32) -> float32x4x4_t;
        unsafe fn vld2_s8(a: *const i8) -> int8x8x2_t;
        unsafe fn vld2_s16(a: *const i16) -> int16x4x2_t;
        unsafe fn vld2_s32(a: *const i32) -> int32x2x2_t;
        unsafe fn vld2q_s8(a: *const i8) -> int8x16x2_t;
        unsafe fn vld2q_s16(a: *const i16) -> int16x8x2_t;
        unsafe fn vld2q_s32(a: *const i32) -> int32x4x2_t;
        unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t;
        unsafe fn vld2_u8(a: *const u8) -> uint8x8x2_t;
        unsafe fn vld2_u16(a: *const u16) -> uint16x4x2_t;
        unsafe fn vld2_u32(a: *const u32) -> uint32x2x2_t;
        unsafe fn vld2q_u8(a: *const u8) -> uint8x16x2_t;
        unsafe fn vld2q_u16(a: *const u16) -> uint16x8x2_t;
        unsafe fn vld2q_u32(a: *const u32) -> uint32x4x2_t;
        unsafe fn vld2_p8(a: *const p8) -> poly8x8x2_t;
        unsafe fn vld2_p16(a: *const p16) -> poly16x4x2_t;
        unsafe fn vld2q_p8(a: *const p8) -> poly8x16x2_t;
        unsafe fn vld2q_p16(a: *const p16) -> poly16x8x2_t;
vld2_u64(a: *const u64) -> uint64x1x2_t; unsafe fn vld2_f32(a: *const f32) -> float32x2x2_t; unsafe fn vld2q_f32(a: *const f32) -> float32x4x2_t; unsafe fn vld2_dup_s8(a: *const i8) -> int8x8x2_t; unsafe fn vld2_dup_s16(a: *const i16) -> int16x4x2_t; unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t; unsafe fn vld2q_dup_s8(a: *const i8) -> int8x16x2_t; unsafe fn vld2q_dup_s16(a: *const i16) -> int16x8x2_t; unsafe fn vld2q_dup_s32(a: *const i32) -> int32x4x2_t; unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t; unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t; unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t; unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t; unsafe fn vld2q_dup_u8(a: *const u8) -> uint8x16x2_t; unsafe fn vld2q_dup_u16(a: *const u16) -> uint16x8x2_t; unsafe fn vld2q_dup_u32(a: *const u32) -> uint32x4x2_t; unsafe fn vld2_dup_p8(a: *const p8) -> poly8x8x2_t; unsafe fn vld2_dup_p16(a: *const p16) -> poly16x4x2_t; unsafe fn vld2q_dup_p8(a: *const p8) -> poly8x16x2_t; unsafe fn vld2q_dup_p16(a: *const p16) -> poly16x8x2_t; unsafe fn vld2_dup_u64(a: *const u64) -> uint64x1x2_t; unsafe fn vld2_dup_f32(a: *const f32) -> float32x2x2_t; unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t; unsafe fn vld2_lane_s8(a: *const i8, b: int8x8x2_t) -> int8x8x2_t; unsafe fn vld2_lane_s16(a: *const i16, b: int16x4x2_t) -> int16x4x2_t; unsafe fn vld2_lane_s32(a: *const i32, b: int32x2x2_t) -> int32x2x2_t; unsafe fn vld2q_lane_s16(a: *const i16, b: int16x8x2_t) -> int16x8x2_t; unsafe fn vld2q_lane_s32(a: *const i32, b: int32x4x2_t) -> int32x4x2_t; unsafe fn vld2_lane_u8(a: *const u8, b: uint8x8x2_t) -> uint8x8x2_t; unsafe fn vld2_lane_u16(a: *const u16, b: uint16x4x2_t) -> uint16x4x2_t; unsafe fn vld2_lane_u32(a: *const u32, b: uint32x2x2_t) -> uint32x2x2_t; unsafe fn vld2q_lane_u16(a: *const u16, b: uint16x8x2_t) -> uint16x8x2_t; unsafe fn vld2q_lane_u32(a: *const u32, b: uint32x4x2_t) -> uint32x4x2_t; unsafe fn vld2_lane_p8(a: *const p8, b: poly8x8x2_t) -> poly8x8x2_t; unsafe fn vld2_lane_p16(a: *const p16, b: poly16x4x2_t) -> poly16x4x2_t; unsafe fn vld2q_lane_p16(a: *const p16, b: poly16x8x2_t) -> poly16x8x2_t; unsafe fn vld2_lane_f32(a: *const f32, b: float32x2x2_t) -> float32x2x2_t; unsafe fn vld2q_lane_f32(a: *const f32, b: float32x4x2_t) -> float32x4x2_t; unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t; unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t; unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t; unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t; unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t; unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t; unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t; unsafe fn vld3_u8(a: *const u8) -> uint8x8x3_t; unsafe fn vld3_u16(a: *const u16) -> uint16x4x3_t; unsafe fn vld3_u32(a: *const u32) -> uint32x2x3_t; unsafe fn vld3q_u8(a: *const u8) -> uint8x16x3_t; unsafe fn vld3q_u16(a: *const u16) -> uint16x8x3_t; unsafe fn vld3q_u32(a: *const u32) -> uint32x4x3_t; unsafe fn vld3_p8(a: *const p8) -> poly8x8x3_t; unsafe fn vld3_p16(a: *const p16) -> poly16x4x3_t; unsafe fn vld3q_p8(a: *const p8) -> poly8x16x3_t; unsafe fn vld3q_p16(a: *const p16) -> poly16x8x3_t; unsafe fn vld3_u64(a: *const u64) -> uint64x1x3_t; unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t; unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t; unsafe fn vld3_dup_s8(a: *const i8) -> int8x8x3_t; unsafe fn vld3_dup_s16(a: *const i16) -> int16x4x3_t; unsafe fn vld3_dup_s32(a: *const i32) -> int32x2x3_t; unsafe fn vld3q_dup_s8(a: *const i8) -> int8x16x3_t; unsafe fn 
vld3q_dup_s16(a: *const i16) -> int16x8x3_t; unsafe fn vld3q_dup_s32(a: *const i32) -> int32x4x3_t; unsafe fn vld3_dup_s64(a: *const i64) -> int64x1x3_t; unsafe fn vld3_dup_u8(a: *const u8) -> uint8x8x3_t; unsafe fn vld3_dup_u16(a: *const u16) -> uint16x4x3_t; unsafe fn vld3_dup_u32(a: *const u32) -> uint32x2x3_t; unsafe fn vld3q_dup_u8(a: *const u8) -> uint8x16x3_t; unsafe fn vld3q_dup_u16(a: *const u16) -> uint16x8x3_t; unsafe fn vld3q_dup_u32(a: *const u32) -> uint32x4x3_t; unsafe fn vld3_dup_p8(a: *const p8) -> poly8x8x3_t; unsafe fn vld3_dup_p16(a: *const p16) -> poly16x4x3_t; unsafe fn vld3q_dup_p8(a: *const p8) -> poly8x16x3_t; unsafe fn vld3q_dup_p16(a: *const p16) -> poly16x8x3_t; unsafe fn vld3_dup_u64(a: *const u64) -> uint64x1x3_t; unsafe fn vld3_dup_f32(a: *const f32) -> float32x2x3_t; unsafe fn vld3q_dup_f32(a: *const f32) -> float32x4x3_t; unsafe fn vld3_lane_s8(a: *const i8, b: int8x8x3_t) -> int8x8x3_t; unsafe fn vld3_lane_s16(a: *const i16, b: int16x4x3_t) -> int16x4x3_t; unsafe fn vld3_lane_s32(a: *const i32, b: int32x2x3_t) -> int32x2x3_t; unsafe fn vld3q_lane_s16(a: *const i16, b: int16x8x3_t) -> int16x8x3_t; unsafe fn vld3q_lane_s32(a: *const i32, b: int32x4x3_t) -> int32x4x3_t; unsafe fn vld3_lane_u8(a: *const u8, b: uint8x8x3_t) -> uint8x8x3_t; unsafe fn vld3_lane_u16(a: *const u16, b: uint16x4x3_t) -> uint16x4x3_t; unsafe fn vld3_lane_u32(a: *const u32, b: uint32x2x3_t) -> uint32x2x3_t; unsafe fn vld3q_lane_u16(a: *const u16, b: uint16x8x3_t) -> uint16x8x3_t; unsafe fn vld3q_lane_u32(a: *const u32, b: uint32x4x3_t) -> uint32x4x3_t; unsafe fn vld3_lane_p8(a: *const p8, b: poly8x8x3_t) -> poly8x8x3_t; unsafe fn vld3_lane_p16(a: *const p16, b: poly16x4x3_t) -> poly16x4x3_t; unsafe fn vld3q_lane_p16(a: *const p16, b: poly16x8x3_t) -> poly16x8x3_t; unsafe fn vld3_lane_f32(a: *const f32, b: float32x2x3_t) -> float32x2x3_t; unsafe fn vld3q_lane_f32(a: *const f32, b: float32x4x3_t) -> float32x4x3_t; unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t; unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t; unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t; unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t; unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t; unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t; unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t; unsafe fn vld4_u8(a: *const u8) -> uint8x8x4_t; unsafe fn vld4_u16(a: *const u16) -> uint16x4x4_t; unsafe fn vld4_u32(a: *const u32) -> uint32x2x4_t; unsafe fn vld4q_u8(a: *const u8) -> uint8x16x4_t; unsafe fn vld4q_u16(a: *const u16) -> uint16x8x4_t; unsafe fn vld4q_u32(a: *const u32) -> uint32x4x4_t; unsafe fn vld4_p8(a: *const p8) -> poly8x8x4_t; unsafe fn vld4_p16(a: *const p16) -> poly16x4x4_t; unsafe fn vld4q_p8(a: *const p8) -> poly8x16x4_t; unsafe fn vld4q_p16(a: *const p16) -> poly16x8x4_t; unsafe fn vld4_u64(a: *const u64) -> uint64x1x4_t; unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t; unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t; unsafe fn vld4_dup_s8(a: *const i8) -> int8x8x4_t; unsafe fn vld4_dup_s16(a: *const i16) -> int16x4x4_t; unsafe fn vld4_dup_s32(a: *const i32) -> int32x2x4_t; unsafe fn vld4q_dup_s8(a: *const i8) -> int8x16x4_t; unsafe fn vld4q_dup_s16(a: *const i16) -> int16x8x4_t; unsafe fn vld4q_dup_s32(a: *const i32) -> int32x4x4_t; unsafe fn vld4_dup_s64(a: *const i64) -> int64x1x4_t; unsafe fn vld4_dup_u8(a: *const u8) -> uint8x8x4_t; unsafe fn vld4_dup_u16(a: *const u16) -> uint16x4x4_t; unsafe fn vld4_dup_u32(a: *const u32) -> uint32x2x4_t; unsafe fn vld4q_dup_u8(a: *const u8) -> uint8x16x4_t; 
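// Usage sketch (illustrative only, not part of this listing): the vld2/vld3/
// vld4 families de-interleave as they load, so `vld2q_f32` turns interleaved
// stereo samples directly into a left plane and a right plane. Assumes an
// aarch64 target, where NEON is baseline; the function name is made up here.
fn split_stereo(interleaved: &[f32; 8]) -> ([f32; 4], [f32; 4]) {
    use core::arch::aarch64::{vld2q_f32, vst1q_f32};
    let (mut left, mut right) = ([0.0f32; 4], [0.0f32; 4]);
    // Safety: both pointers are valid for the element counts accessed.
    unsafe {
        // Loads L R L R L R L R; `.0` receives every L, `.1` every R.
        let planes = vld2q_f32(interleaved.as_ptr());
        vst1q_f32(left.as_mut_ptr(), planes.0);
        vst1q_f32(right.as_mut_ptr(), planes.1);
    }
    (left, right)
}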
unsafe fn vld4q_dup_u16(a: *const u16) -> uint16x8x4_t; unsafe fn vld4q_dup_u32(a: *const u32) -> uint32x4x4_t; unsafe fn vld4_dup_p8(a: *const p8) -> poly8x8x4_t; unsafe fn vld4_dup_p16(a: *const p16) -> poly16x4x4_t; unsafe fn vld4q_dup_p8(a: *const p8) -> poly8x16x4_t; unsafe fn vld4q_dup_p16(a: *const p16) -> poly16x8x4_t; unsafe fn vld4_dup_u64(a: *const u64) -> uint64x1x4_t; unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t; unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t; unsafe fn vld4_lane_s8(a: *const i8, b: int8x8x4_t) -> int8x8x4_t; unsafe fn vld4_lane_s16(a: *const i16, b: int16x4x4_t) -> int16x4x4_t; unsafe fn vld4_lane_s32(a: *const i32, b: int32x2x4_t) -> int32x2x4_t; unsafe fn vld4q_lane_s16(a: *const i16, b: int16x8x4_t) -> int16x8x4_t; unsafe fn vld4q_lane_s32(a: *const i32, b: int32x4x4_t) -> int32x4x4_t; unsafe fn vld4_lane_u8(a: *const u8, b: uint8x8x4_t) -> uint8x8x4_t; unsafe fn vld4_lane_u16(a: *const u16, b: uint16x4x4_t) -> uint16x4x4_t; unsafe fn vld4_lane_u32(a: *const u32, b: uint32x2x4_t) -> uint32x2x4_t; unsafe fn vld4q_lane_u16(a: *const u16, b: uint16x8x4_t) -> uint16x8x4_t; unsafe fn vld4q_lane_u32(a: *const u32, b: uint32x4x4_t) -> uint32x4x4_t; unsafe fn vld4_lane_p8(a: *const p8, b: poly8x8x4_t) -> poly8x8x4_t; unsafe fn vld4_lane_p16(a: *const p16, b: poly16x4x4_t) -> poly16x4x4_t; unsafe fn vld4q_lane_p16(a: *const p16, b: poly16x8x4_t) -> poly16x8x4_t; unsafe fn vld4_lane_f32(a: *const f32, b: float32x2x4_t) -> float32x2x4_t; unsafe fn vld4q_lane_f32(a: *const f32, b: float32x4x4_t) -> float32x4x4_t; unsafe fn vst1_lane_s8(a: *mut i8, b: int8x8_t); unsafe fn vst1_lane_s16(a: *mut i16, b: int16x4_t); unsafe fn vst1_lane_s32(a: *mut i32, b: int32x2_t); unsafe fn vst1_lane_s64(a: *mut i64, b: int64x1_t); unsafe fn vst1q_lane_s8(a: *mut i8, b: int8x16_t); unsafe fn vst1q_lane_s16(a: *mut i16, b: int16x8_t); unsafe fn vst1q_lane_s32(a: *mut i32, b: int32x4_t); unsafe fn vst1q_lane_s64(a: *mut i64, b: int64x2_t); unsafe fn vst1_lane_u8(a: *mut u8, b: uint8x8_t); unsafe fn vst1_lane_u16(a: *mut u16, b: uint16x4_t); unsafe fn vst1_lane_u32(a: *mut u32, b: uint32x2_t); unsafe fn vst1_lane_u64(a: *mut u64, b: uint64x1_t); unsafe fn vst1q_lane_u8(a: *mut u8, b: uint8x16_t); unsafe fn vst1q_lane_u16(a: *mut u16, b: uint16x8_t); unsafe fn vst1q_lane_u32(a: *mut u32, b: uint32x4_t); unsafe fn vst1q_lane_u64(a: *mut u64, b: uint64x2_t); unsafe fn vst1_lane_p8(a: *mut p8, b: poly8x8_t); unsafe fn vst1_lane_p16(a: *mut p16, b: poly16x4_t); unsafe fn vst1q_lane_p8(a: *mut p8, b: poly8x16_t); unsafe fn vst1q_lane_p16(a: *mut p16, b: poly16x8_t); unsafe fn vst1q_lane_p64(a: *mut p64, b: poly64x2_t); unsafe fn vst1_lane_f32(a: *mut f32, b: float32x2_t); unsafe fn vst1q_lane_f32(a: *mut f32, b: float32x4_t); unsafe fn vst1_s8_x2(a: *mut i8, b: int8x8x2_t); unsafe fn vst1_s16_x2(a: *mut i16, b: int16x4x2_t); unsafe fn vst1_s32_x2(a: *mut i32, b: int32x2x2_t); unsafe fn vst1_s64_x2(a: *mut i64, b: int64x1x2_t); unsafe fn vst1q_s8_x2(a: *mut i8, b: int8x16x2_t); unsafe fn vst1q_s16_x2(a: *mut i16, b: int16x8x2_t); unsafe fn vst1q_s32_x2(a: *mut i32, b: int32x4x2_t); unsafe fn vst1q_s64_x2(a: *mut i64, b: int64x2x2_t); unsafe fn vst1_s8_x3(a: *mut i8, b: int8x8x3_t); unsafe fn vst1_s16_x3(a: *mut i16, b: int16x4x3_t); unsafe fn vst1_s32_x3(a: *mut i32, b: int32x2x3_t); unsafe fn vst1_s64_x3(a: *mut i64, b: int64x1x3_t); unsafe fn vst1q_s8_x3(a: *mut i8, b: int8x16x3_t); unsafe fn vst1q_s16_x3(a: *mut i16, b: int16x8x3_t); unsafe fn vst1q_s32_x3(a: *mut i32, b: 
int32x4x3_t); unsafe fn vst1q_s64_x3(a: *mut i64, b: int64x2x3_t); unsafe fn vst1_s8_x4(a: *mut i8, b: int8x8x4_t); unsafe fn vst1_s16_x4(a: *mut i16, b: int16x4x4_t); unsafe fn vst1_s32_x4(a: *mut i32, b: int32x2x4_t); unsafe fn vst1_s64_x4(a: *mut i64, b: int64x1x4_t); unsafe fn vst1q_s8_x4(a: *mut i8, b: int8x16x4_t); unsafe fn vst1q_s16_x4(a: *mut i16, b: int16x8x4_t); unsafe fn vst1q_s32_x4(a: *mut i32, b: int32x4x4_t); unsafe fn vst1q_s64_x4(a: *mut i64, b: int64x2x4_t); unsafe fn vst1_u8_x2(a: *mut u8, b: uint8x8x2_t); unsafe fn vst1_u16_x2(a: *mut u16, b: uint16x4x2_t); unsafe fn vst1_u32_x2(a: *mut u32, b: uint32x2x2_t); unsafe fn vst1_u64_x2(a: *mut u64, b: uint64x1x2_t); unsafe fn vst1q_u8_x2(a: *mut u8, b: uint8x16x2_t); unsafe fn vst1q_u16_x2(a: *mut u16, b: uint16x8x2_t); unsafe fn vst1q_u32_x2(a: *mut u32, b: uint32x4x2_t); unsafe fn vst1q_u64_x2(a: *mut u64, b: uint64x2x2_t); unsafe fn vst1_u8_x3(a: *mut u8, b: uint8x8x3_t); unsafe fn vst1_u16_x3(a: *mut u16, b: uint16x4x3_t); unsafe fn vst1_u32_x3(a: *mut u32, b: uint32x2x3_t); unsafe fn vst1_u64_x3(a: *mut u64, b: uint64x1x3_t); unsafe fn vst1q_u8_x3(a: *mut u8, b: uint8x16x3_t); unsafe fn vst1q_u16_x3(a: *mut u16, b: uint16x8x3_t); unsafe fn vst1q_u32_x3(a: *mut u32, b: uint32x4x3_t); unsafe fn vst1q_u64_x3(a: *mut u64, b: uint64x2x3_t); unsafe fn vst1_u8_x4(a: *mut u8, b: uint8x8x4_t); unsafe fn vst1_u16_x4(a: *mut u16, b: uint16x4x4_t); unsafe fn vst1_u32_x4(a: *mut u32, b: uint32x2x4_t); unsafe fn vst1_u64_x4(a: *mut u64, b: uint64x1x4_t); unsafe fn vst1q_u8_x4(a: *mut u8, b: uint8x16x4_t); unsafe fn vst1q_u16_x4(a: *mut u16, b: uint16x8x4_t); unsafe fn vst1q_u32_x4(a: *mut u32, b: uint32x4x4_t); unsafe fn vst1q_u64_x4(a: *mut u64, b: uint64x2x4_t); unsafe fn vst1_p8_x2(a: *mut p8, b: poly8x8x2_t); unsafe fn vst1_p8_x3(a: *mut p8, b: poly8x8x3_t); unsafe fn vst1_p8_x4(a: *mut p8, b: poly8x8x4_t); unsafe fn vst1q_p8_x2(a: *mut p8, b: poly8x16x2_t); unsafe fn vst1q_p8_x3(a: *mut p8, b: poly8x16x3_t); unsafe fn vst1q_p8_x4(a: *mut p8, b: poly8x16x4_t); unsafe fn vst1_p16_x2(a: *mut p16, b: poly16x4x2_t); unsafe fn vst1_p16_x3(a: *mut p16, b: poly16x4x3_t); unsafe fn vst1_p16_x4(a: *mut p16, b: poly16x4x4_t); unsafe fn vst1q_p16_x2(a: *mut p16, b: poly16x8x2_t); unsafe fn vst1q_p16_x3(a: *mut p16, b: poly16x8x3_t); unsafe fn vst1q_p16_x4(a: *mut p16, b: poly16x8x4_t); unsafe fn vst1_f32_x2(a: *mut f32, b: float32x2x2_t); unsafe fn vst1q_f32_x2(a: *mut f32, b: float32x4x2_t); unsafe fn vst1_f32_x3(a: *mut f32, b: float32x2x3_t); unsafe fn vst1q_f32_x3(a: *mut f32, b: float32x4x3_t); unsafe fn vst1_f32_x4(a: *mut f32, b: float32x2x4_t); unsafe fn vst1q_f32_x4(a: *mut f32, b: float32x4x4_t); unsafe fn vst2_s8(a: *mut i8, b: int8x8x2_t); unsafe fn vst2_s16(a: *mut i16, b: int16x4x2_t); unsafe fn vst2_s32(a: *mut i32, b: int32x2x2_t); unsafe fn vst2q_s8(a: *mut i8, b: int8x16x2_t); unsafe fn vst2q_s16(a: *mut i16, b: int16x8x2_t); unsafe fn vst2q_s32(a: *mut i32, b: int32x4x2_t); unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t); unsafe fn vst2_u8(a: *mut u8, b: uint8x8x2_t); unsafe fn vst2_u16(a: *mut u16, b: uint16x4x2_t); unsafe fn vst2_u32(a: *mut u32, b: uint32x2x2_t); unsafe fn vst2q_u8(a: *mut u8, b: uint8x16x2_t); unsafe fn vst2q_u16(a: *mut u16, b: uint16x8x2_t); unsafe fn vst2q_u32(a: *mut u32, b: uint32x4x2_t); unsafe fn vst2_p8(a: *mut p8, b: poly8x8x2_t); unsafe fn vst2_p16(a: *mut p16, b: poly16x4x2_t); unsafe fn vst2q_p8(a: *mut p8, b: poly8x16x2_t); unsafe fn vst2q_p16(a: *mut p16, b: poly16x8x2_t); unsafe fn 
vst2_u64(a: *mut u64, b: uint64x1x2_t); unsafe fn vst2_f32(a: *mut f32, b: float32x2x2_t); unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t); unsafe fn vst2_lane_s8(a: *mut i8, b: int8x8x2_t); unsafe fn vst2_lane_s16(a: *mut i16, b: int16x4x2_t); unsafe fn vst2_lane_s32(a: *mut i32, b: int32x2x2_t); unsafe fn vst2q_lane_s16(a: *mut i16, b: int16x8x2_t); unsafe fn vst2q_lane_s32(a: *mut i32, b: int32x4x2_t); unsafe fn vst2_lane_u8(a: *mut u8, b: uint8x8x2_t); unsafe fn vst2_lane_u16(a: *mut u16, b: uint16x4x2_t); unsafe fn vst2_lane_u32(a: *mut u32, b: uint32x2x2_t); unsafe fn vst2q_lane_u16(a: *mut u16, b: uint16x8x2_t); unsafe fn vst2q_lane_u32(a: *mut u32, b: uint32x4x2_t); unsafe fn vst2_lane_p8(a: *mut p8, b: poly8x8x2_t); unsafe fn vst2_lane_p16(a: *mut p16, b: poly16x4x2_t); unsafe fn vst2q_lane_p16(a: *mut p16, b: poly16x8x2_t); unsafe fn vst2_lane_f32(a: *mut f32, b: float32x2x2_t); unsafe fn vst2q_lane_f32(a: *mut f32, b: float32x4x2_t); unsafe fn vst3_s8(a: *mut i8, b: int8x8x3_t); unsafe fn vst3_s16(a: *mut i16, b: int16x4x3_t); unsafe fn vst3_s32(a: *mut i32, b: int32x2x3_t); unsafe fn vst3q_s8(a: *mut i8, b: int8x16x3_t); unsafe fn vst3q_s16(a: *mut i16, b: int16x8x3_t); unsafe fn vst3q_s32(a: *mut i32, b: int32x4x3_t); unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t); unsafe fn vst3_u8(a: *mut u8, b: uint8x8x3_t); unsafe fn vst3_u16(a: *mut u16, b: uint16x4x3_t); unsafe fn vst3_u32(a: *mut u32, b: uint32x2x3_t); unsafe fn vst3q_u8(a: *mut u8, b: uint8x16x3_t); unsafe fn vst3q_u16(a: *mut u16, b: uint16x8x3_t); unsafe fn vst3q_u32(a: *mut u32, b: uint32x4x3_t); unsafe fn vst3_p8(a: *mut p8, b: poly8x8x3_t); unsafe fn vst3_p16(a: *mut p16, b: poly16x4x3_t); unsafe fn vst3q_p8(a: *mut p8, b: poly8x16x3_t); unsafe fn vst3q_p16(a: *mut p16, b: poly16x8x3_t); unsafe fn vst3_u64(a: *mut u64, b: uint64x1x3_t); unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t); unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t); unsafe fn vst3_lane_s8(a: *mut i8, b: int8x8x3_t); unsafe fn vst3_lane_s16(a: *mut i16, b: int16x4x3_t); unsafe fn vst3_lane_s32(a: *mut i32, b: int32x2x3_t); unsafe fn vst3q_lane_s16(a: *mut i16, b: int16x8x3_t); unsafe fn vst3q_lane_s32(a: *mut i32, b: int32x4x3_t); unsafe fn vst3_lane_u8(a: *mut u8, b: uint8x8x3_t); unsafe fn vst3_lane_u16(a: *mut u16, b: uint16x4x3_t); unsafe fn vst3_lane_u32(a: *mut u32, b: uint32x2x3_t); unsafe fn vst3q_lane_u16(a: *mut u16, b: uint16x8x3_t); unsafe fn vst3q_lane_u32(a: *mut u32, b: uint32x4x3_t); unsafe fn vst3_lane_p8(a: *mut p8, b: poly8x8x3_t); unsafe fn vst3_lane_p16(a: *mut p16, b: poly16x4x3_t); unsafe fn vst3q_lane_p16(a: *mut p16, b: poly16x8x3_t); unsafe fn vst3_lane_f32(a: *mut f32, b: float32x2x3_t); unsafe fn vst3q_lane_f32(a: *mut f32, b: float32x4x3_t); unsafe fn vst4_s8(a: *mut i8, b: int8x8x4_t); unsafe fn vst4_s16(a: *mut i16, b: int16x4x4_t); unsafe fn vst4_s32(a: *mut i32, b: int32x2x4_t); unsafe fn vst4q_s8(a: *mut i8, b: int8x16x4_t); unsafe fn vst4q_s16(a: *mut i16, b: int16x8x4_t); unsafe fn vst4q_s32(a: *mut i32, b: int32x4x4_t); unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t); unsafe fn vst4_u8(a: *mut u8, b: uint8x8x4_t); unsafe fn vst4_u16(a: *mut u16, b: uint16x4x4_t); unsafe fn vst4_u32(a: *mut u32, b: uint32x2x4_t); unsafe fn vst4q_u8(a: *mut u8, b: uint8x16x4_t); unsafe fn vst4q_u16(a: *mut u16, b: uint16x8x4_t); unsafe fn vst4q_u32(a: *mut u32, b: uint32x4x4_t); unsafe fn vst4_p8(a: *mut p8, b: poly8x8x4_t); unsafe fn vst4_p16(a: *mut p16, b: poly16x4x4_t); unsafe fn vst4q_p8(a: *mut p8, b: poly8x16x4_t); 
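// Usage sketch (illustrative only): the vst2/vst3/vst4 stores are the mirror
// image of the de-interleaving loads, so `vst4q_u8` writes four per-channel
// registers back out as interleaved R G B A bytes. Assumes an aarch64 target;
// `fill_rgba` and the channel constants are invented for the example.
fn fill_rgba(dst: &mut [u8; 64]) {
    use core::arch::aarch64::{uint8x16x4_t, vdupq_n_u8, vst4q_u8};
    let planes = uint8x16x4_t(
        vdupq_n_u8(0xff), // R
        vdupq_n_u8(0x80), // G
        vdupq_n_u8(0x00), // B
        vdupq_n_u8(0xff), // A
    );
    // Safety: `dst` holds exactly the 16 * 4 bytes the store writes.
    unsafe { vst4q_u8(dst.as_mut_ptr(), planes) };
}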
unsafe fn vst4q_p16(a: *mut p16, b: poly16x8x4_t);
unsafe fn vst4_u64(a: *mut u64, b: uint64x1x4_t);
unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t);
unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t);
unsafe fn vst4_lane_s8(a: *mut i8, b: int8x8x4_t);
unsafe fn vst4_lane_s16(a: *mut i16, b: int16x4x4_t);
unsafe fn vst4_lane_s32(a: *mut i32, b: int32x2x4_t);
unsafe fn vst4q_lane_s16(a: *mut i16, b: int16x8x4_t);
unsafe fn vst4q_lane_s32(a: *mut i32, b: int32x4x4_t);
unsafe fn vst4_lane_u8(a: *mut u8, b: uint8x8x4_t);
unsafe fn vst4_lane_u16(a: *mut u16, b: uint16x4x4_t);
unsafe fn vst4_lane_u32(a: *mut u32, b: uint32x2x4_t);
unsafe fn vst4q_lane_u16(a: *mut u16, b: uint16x8x4_t);
unsafe fn vst4q_lane_u32(a: *mut u32, b: uint32x4x4_t);
unsafe fn vst4_lane_p8(a: *mut p8, b: poly8x8x4_t);
unsafe fn vst4_lane_p16(a: *mut p16, b: poly16x4x4_t);
unsafe fn vst4q_lane_p16(a: *mut p16, b: poly16x8x4_t);
unsafe fn vst4_lane_f32(a: *mut f32, b: float32x2x4_t);
unsafe fn vst4q_lane_f32(a: *mut f32, b: float32x4x4_t);
fn vmul_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vmulq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vmul_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vmulq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vmul_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vmulq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vmul_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vmulq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
fn vmul_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vmulq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vmul_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vmulq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vmul_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t;
fn vmulq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t;
fn vmul_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vmulq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vmul_n_s16(a: int16x4_t, b: i16) -> int16x4_t;
fn vmulq_n_s16(a: int16x8_t, b: i16) -> int16x8_t;
fn vmul_n_s32(a: int32x2_t, b: i32) -> int32x2_t;
fn vmulq_n_s32(a: int32x4_t, b: i32) -> int32x4_t;
fn vmul_n_u16(a: uint16x4_t, b: u16) -> uint16x4_t;
fn vmulq_n_u16(a: uint16x8_t, b: u16) -> uint16x8_t;
fn vmul_n_u32(a: uint32x2_t, b: u32) -> uint32x2_t;
fn vmulq_n_u32(a: uint32x4_t, b: u32) -> uint32x4_t;
fn vmul_n_f32(a: float32x2_t, b: f32) -> float32x2_t;
fn vmulq_n_f32(a: float32x4_t, b: f32) -> float32x4_t;
fn vmul_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vmul_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t;
fn vmulq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t;
fn vmulq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vmul_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vmul_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t;
fn vmulq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t;
fn vmulq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vmul_lane_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vmul_laneq_u16(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t;
fn vmulq_lane_u16(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t;
fn vmulq_laneq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vmul_lane_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vmul_laneq_u32(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t;
fn vmulq_lane_u32(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t;
fn vmulq_laneq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vmul_lane_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vmul_laneq_f32(a: float32x2_t, b: float32x4_t) -> float32x2_t;
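// Usage sketch (illustrative only): the `_n` multiply variants broadcast a
// scalar across every lane, so scaling a vector needs no explicit splat.
// Assumes an aarch64 target; `scale4` is an invented name.
fn scale4(v: [f32; 4], s: f32) -> [f32; 4] {
    use core::arch::aarch64::{vld1q_f32, vmulq_n_f32, vst1q_f32};
    let mut out = [0.0f32; 4];
    // Safety: both pointers cover exactly four f32 values.
    unsafe { vst1q_f32(out.as_mut_ptr(), vmulq_n_f32(vld1q_f32(v.as_ptr()), s)) };
    out
}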
fn vmulq_lane_f32(a: float32x4_t, b: float32x2_t) -> float32x4_t;
fn vmulq_laneq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vmull_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t;
fn vmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t;
fn vmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t;
fn vmull_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t;
fn vmull_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t;
fn vmull_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t;
fn vmull_p8(a: poly8x8_t, b: poly8x8_t) -> poly16x8_t;
fn vmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t;
fn vmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t;
fn vmull_n_u16(a: uint16x4_t, b: u16) -> uint32x4_t;
fn vmull_n_u32(a: uint32x2_t, b: u32) -> uint64x2_t;
fn vmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t;
fn vmull_laneq_s16(a: int16x4_t, b: int16x8_t) -> int32x4_t;
fn vmull_lane_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t;
fn vmull_laneq_s32(a: int32x2_t, b: int32x4_t) -> int64x2_t;
fn vmull_lane_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t;
fn vmull_laneq_u16(a: uint16x4_t, b: uint16x8_t) -> uint32x4_t;
fn vmull_lane_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t;
fn vmull_laneq_u32(a: uint32x2_t, b: uint32x4_t) -> uint64x2_t;
fn vfma_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
fn vfmaq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
fn vfma_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t;
fn vfmaq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t;
fn vfms_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
fn vfmsq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
fn vfms_n_f32(a: float32x2_t, b: float32x2_t, c: f32) -> float32x2_t;
fn vfmsq_n_f32(a: float32x4_t, b: float32x4_t, c: f32) -> float32x4_t;
fn vsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
fn vsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vsub_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
fn vsubq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
fn vsub_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
fn vsubq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vsub_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vsubq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vadd_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t;
fn vadd_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t;
fn vaddq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t;
fn vaddq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t;
fn vadd_p64(a: poly64x1_t, b: poly64x1_t) -> poly64x1_t;
fn vaddq_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t;
fn vaddq_p128(a: p128, b: p128) -> p128;
fn vsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t;
fn vsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t;
fn vsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t;
fn vsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t;
fn vsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t;
fn vsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t;
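// Usage sketch (illustrative only): `vfmaq_f32(a, b, c)` fuses `a + b * c`
// into a single rounding, the building block of Horner evaluation. Evaluating
// c0 + c1*x + c2*x^2 lane-wise, on an assumed aarch64 target; `poly2` is an
// invented name.
fn poly2(x: [f32; 4], c: [f32; 3]) -> [f32; 4] {
    use core::arch::aarch64::{vdupq_n_f32, vfmaq_f32, vld1q_f32, vst1q_f32};
    let mut out = [0.0f32; 4];
    unsafe {
        let xv = vld1q_f32(x.as_ptr());
        // Horner: (c2 * x + c1) * x + c0, one fused step per coefficient.
        let acc = vfmaq_f32(vdupq_n_f32(c[1]), vdupq_n_f32(c[2]), xv);
        let acc = vfmaq_f32(vdupq_n_f32(c[0]), acc, xv);
        vst1q_f32(out.as_mut_ptr(), acc);
    }
    out
}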
fn vsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t;
fn vsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t;
fn vsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t;
fn vsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t;
fn vsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t;
fn vsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t;
fn vhsub_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vhsubq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
fn vhsub_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vhsubq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vhsub_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vhsubq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vhsub_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vhsubq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vhsub_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vhsubq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vhsub_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vhsubq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vsubw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t;
fn vsubw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t;
fn vsubw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t;
fn vsubw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t;
fn vsubw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t;
fn vsubw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t;
fn vsubl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t;
fn vsubl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t;
fn vsubl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t;
fn vsubl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t;
fn vsubl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t;
fn vsubl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t;
fn vmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
fn vmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
fn vmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
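// Usage sketch (illustrative only): vmin/vmax compose into a lane-wise clamp.
// Note the contrast with the `nm` variants adjacent in this listing:
// vminnm/vmaxnm follow IEEE minNum/maxNum and prefer the non-NaN operand,
// while vmin/vmax propagate NaN. Assumes an aarch64 target; `clamp01` is an
// invented name.
fn clamp01(v: [f32; 4]) -> [f32; 4] {
    use core::arch::aarch64::{vdupq_n_f32, vld1q_f32, vmaxq_f32, vminq_f32, vst1q_f32};
    let mut out = [0.0f32; 4];
    unsafe {
        let clamped = vminq_f32(
            vmaxq_f32(vld1q_f32(v.as_ptr()), vdupq_n_f32(0.0)),
            vdupq_n_f32(1.0),
        );
        vst1q_f32(out.as_mut_ptr(), clamped);
    }
    out
}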
fn vminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vpadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vqdmull_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t;
fn vqdmull_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t;
fn vqdmull_n_s16(a: int16x4_t, b: i16) -> int32x4_t;
fn vqdmull_n_s32(a: int32x2_t, b: i32) -> int64x2_t;
fn vqdmull_lane_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t;
fn vqdmull_lane_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t;
fn vqdmlal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t;
fn vqdmlal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t;
fn vqdmlal_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t;
fn vqdmlal_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t;
fn vqdmlal_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t;
fn vqdmlal_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t;
fn vqdmlsl_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t;
fn vqdmlsl_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t;
fn vqdmlsl_n_s16(a: int32x4_t, b: int16x4_t, c: i16) -> int32x4_t;
fn vqdmlsl_n_s32(a: int64x2_t, b: int32x2_t, c: i32) -> int64x2_t;
fn vqdmlsl_lane_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t;
fn vqdmlsl_lane_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t;
fn vqdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vqdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vqdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vqdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vqdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t;
fn vqdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t;
fn vqdmulhq_n_s16(a: int16x8_t, b: i16) -> int16x8_t;
fn vqdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t;
fn vqdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vqdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t;
fn vqdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vqdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t;
fn vqmovn_s16(a: int16x8_t) -> int8x8_t;
fn vqmovn_s32(a: int32x4_t) -> int16x4_t;
fn vqmovn_s64(a: int64x2_t) -> int32x2_t;
fn vqmovn_u16(a: uint16x8_t) -> uint8x8_t;
fn vqmovn_u32(a: uint32x4_t) -> uint16x4_t;
fn vqmovn_u64(a: uint64x2_t) -> uint32x2_t;
fn vqmovun_s16(a: int16x8_t) -> uint8x8_t;
fn vqmovun_s32(a: int32x4_t) -> uint16x4_t;
fn vqmovun_s64(a: int64x2_t) -> uint32x2_t;
fn vqrdmulh_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vqrdmulhq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vqrdmulh_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vqrdmulhq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vqrdmulh_n_s16(a: int16x4_t, b: i16) -> int16x4_t;
fn vqrdmulhq_n_s16(a: int16x8_t, b: i16) -> int16x8_t;
fn vqrdmulh_n_s32(a: int32x2_t, b: i32) -> int32x2_t;
fn vqrdmulhq_n_s32(a: int32x4_t, b: i32) -> int32x4_t;
fn vqrdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vqrdmulh_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t;
fn vqrdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t;
fn vqrdmulhq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vqrdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vqrdmulh_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t;
fn vqrdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t;
fn vqrdmulhq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vqrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vqrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vqrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vqrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vqrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vqrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vqrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
fn vqrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
fn vqrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
fn vqrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
fn vqrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
fn vqrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
fn vqrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
fn vqrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
fn vqrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
fn vqrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
fn vqrshrn_n_s16(a: int16x8_t) -> int8x8_t;
fn vqrshrn_n_s32(a: int32x4_t) -> int16x4_t;
fn vqrshrn_n_s64(a: int64x2_t) -> int32x2_t;
fn vqrshrn_n_u16(a: uint16x8_t) -> uint8x8_t;
fn vqrshrn_n_u32(a: uint32x4_t) -> uint16x4_t;
fn vqrshrn_n_u64(a: uint64x2_t) -> uint32x2_t;
fn vqrshrun_n_s16(a: int16x8_t) -> uint8x8_t;
fn vqrshrun_n_s32(a: int32x4_t) -> uint16x4_t;
fn vqrshrun_n_s64(a: int64x2_t) -> uint32x2_t;
fn vqshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vqshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vqshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vqshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vqshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vqshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vqshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
fn vqshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
fn vqshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t;
fn vqshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t;
fn vqshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t;
fn vqshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t;
fn vqshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t;
fn vqshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t;
fn vqshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t;
fn vqshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t;
fn vqshl_n_s8(a: int8x8_t) -> int8x8_t;
fn vqshlq_n_s8(a: int8x16_t) -> int8x16_t;
fn vqshl_n_s16(a: int16x4_t) -> int16x4_t;
fn vqshlq_n_s16(a: int16x8_t) -> int16x8_t;
fn vqshl_n_s32(a: int32x2_t) -> int32x2_t;
fn vqshlq_n_s32(a: int32x4_t) -> int32x4_t;
fn vqshl_n_s64(a: int64x1_t) -> int64x1_t;
fn vqshlq_n_s64(a: int64x2_t) -> int64x2_t;
fn vqshl_n_u8(a: uint8x8_t) -> uint8x8_t;
fn vqshlq_n_u8(a: uint8x16_t) -> uint8x16_t;
fn vqshl_n_u16(a: uint16x4_t) -> uint16x4_t;
fn vqshlq_n_u16(a: uint16x8_t) -> uint16x8_t;
fn vqshl_n_u32(a: uint32x2_t) -> uint32x2_t;
fn vqshlq_n_u32(a: uint32x4_t) -> uint32x4_t;
fn vqshl_n_u64(a: uint64x1_t) -> uint64x1_t;
fn vqshlq_n_u64(a: uint64x2_t) -> uint64x2_t;
fn vqshlu_n_s8(a: int8x8_t) -> uint8x8_t;
fn vqshlu_n_s16(a: int16x4_t) -> uint16x4_t;
fn vqshlu_n_s32(a: int32x2_t) -> uint32x2_t;
fn vqshlu_n_s64(a: int64x1_t) -> uint64x1_t;
fn vqshluq_n_s8(a: int8x16_t) -> uint8x16_t;
fn vqshluq_n_s16(a: int16x8_t) -> uint16x8_t;
fn vqshluq_n_s32(a: int32x4_t) -> uint32x4_t;
fn vqshluq_n_s64(a: int64x2_t) -> uint64x2_t;
fn vqshrn_n_s16(a: int16x8_t) -> int8x8_t;
fn vqshrn_n_s32(a: int32x4_t) -> int16x4_t;
fn vqshrn_n_s64(a: int64x2_t) -> int32x2_t;
fn vqshrn_n_u16(a: uint16x8_t) -> uint8x8_t;
fn vqshrn_n_u32(a: uint32x4_t) -> uint16x4_t;
fn vqshrn_n_u64(a: uint64x2_t) -> uint32x2_t;
fn vqshrun_n_s16(a: int16x8_t) -> uint8x8_t;
fn vqshrun_n_s32(a: int32x4_t) -> uint16x4_t;
fn vqshrun_n_s64(a: int64x2_t) -> uint32x2_t;
fn vrsqrte_f32(a: float32x2_t) -> float32x2_t;
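// Usage sketch (illustrative only): vrsqrte gives a coarse reciprocal
// square-root estimate (roughly 8 bits), and each vrsqrts step supplies the
// Newton-Raphson correction factor (3 - x*e*e) / 2 to refine it. Assumes an
// aarch64 target; `rsqrt4` and the choice of two steps are illustrative.
fn rsqrt4(x: [f32; 4]) -> [f32; 4] {
    use core::arch::aarch64::{vld1q_f32, vmulq_f32, vrsqrteq_f32, vrsqrtsq_f32, vst1q_f32};
    let mut out = [0.0f32; 4];
    unsafe {
        let xv = vld1q_f32(x.as_ptr());
        let mut e = vrsqrteq_f32(xv);
        // Each step: e *= (3 - x*e*e) / 2, roughly doubling the correct bits.
        e = vmulq_f32(e, vrsqrtsq_f32(xv, vmulq_f32(e, e)));
        e = vmulq_f32(e, vrsqrtsq_f32(xv, vmulq_f32(e, e)));
        vst1q_f32(out.as_mut_ptr(), e);
    }
    out
}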
fn vrsqrteq_f32(a: float32x4_t) -> float32x4_t;
fn vrsqrte_u32(a: uint32x2_t) -> uint32x2_t;
fn vrsqrteq_u32(a: uint32x4_t) -> uint32x4_t;
fn vrsqrts_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vrsqrtsq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vrecpe_f32(a: float32x2_t) -> float32x2_t;
fn vrecpeq_f32(a: float32x4_t) -> float32x4_t;
fn vrecpe_u32(a: uint32x2_t) -> uint32x2_t;
fn vrecpeq_u32(a: uint32x4_t) -> uint32x4_t;
fn vrecps_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vrecpsq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vreinterpret_s8_u8(a: uint8x8_t) -> int8x8_t;
fn vreinterpret_s8_p8(a: poly8x8_t) -> int8x8_t;
fn vreinterpret_s16_p16(a: poly16x4_t) -> int16x4_t;
fn vreinterpret_s16_u16(a: uint16x4_t) -> int16x4_t;
fn vreinterpret_s32_u32(a: uint32x2_t) -> int32x2_t;
fn vreinterpret_s64_u64(a: uint64x1_t) -> int64x1_t;
fn vreinterpretq_s8_u8(a: uint8x16_t) -> int8x16_t;
fn vreinterpretq_s8_p8(a: poly8x16_t) -> int8x16_t;
fn vreinterpretq_s16_p16(a: poly16x8_t) -> int16x8_t;
fn vreinterpretq_s16_u16(a: uint16x8_t) -> int16x8_t;
fn vreinterpretq_s32_u32(a: uint32x4_t) -> int32x4_t;
fn vreinterpretq_s64_u64(a: uint64x2_t) -> int64x2_t;
fn vreinterpret_u8_p8(a: poly8x8_t) -> uint8x8_t;
fn vreinterpret_u8_s8(a: int8x8_t) -> uint8x8_t;
fn vreinterpret_u16_p16(a: poly16x4_t) -> uint16x4_t;
fn vreinterpret_u16_s16(a: int16x4_t) -> uint16x4_t;
fn vreinterpret_u32_s32(a: int32x2_t) -> uint32x2_t;
fn vreinterpret_u64_s64(a: int64x1_t) -> uint64x1_t;
fn vreinterpretq_u8_p8(a: poly8x16_t) -> uint8x16_t;
fn vreinterpretq_u8_s8(a: int8x16_t) -> uint8x16_t;
fn vreinterpretq_u16_p16(a: poly16x8_t) -> uint16x8_t;
fn vreinterpretq_u16_s16(a: int16x8_t) -> uint16x8_t;
fn vreinterpretq_u32_s32(a: int32x4_t) -> uint32x4_t;
fn vreinterpretq_u64_s64(a: int64x2_t) -> uint64x2_t;
fn vreinterpret_p8_s8(a: int8x8_t) -> poly8x8_t;
fn vreinterpret_p8_u8(a: uint8x8_t) -> poly8x8_t;
fn vreinterpret_p16_s16(a: int16x4_t) -> poly16x4_t;
fn vreinterpret_p16_u16(a: uint16x4_t) -> poly16x4_t;
fn vreinterpretq_p8_s8(a: int8x16_t) -> poly8x16_t;
fn vreinterpretq_p8_u8(a: uint8x16_t) -> poly8x16_t;
fn vreinterpretq_p16_s16(a: int16x8_t) -> poly16x8_t;
fn vreinterpretq_p16_u16(a: uint16x8_t) -> poly16x8_t;
fn vreinterpret_s8_s16(a: int16x4_t) -> int8x8_t;
fn vreinterpret_s8_u16(a: uint16x4_t) -> int8x8_t;
fn vreinterpret_s8_p16(a: poly16x4_t) -> int8x8_t;
fn vreinterpret_s16_s32(a: int32x2_t) -> int16x4_t;
fn vreinterpret_s16_u32(a: uint32x2_t) -> int16x4_t;
fn vreinterpret_s32_s64(a: int64x1_t) -> int32x2_t;
fn vreinterpret_s32_u64(a: uint64x1_t) -> int32x2_t;
fn vreinterpretq_s8_s16(a: int16x8_t) -> int8x16_t;
fn vreinterpretq_s8_u16(a: uint16x8_t) -> int8x16_t;
fn vreinterpretq_s8_p16(a: poly16x8_t) -> int8x16_t;
fn vreinterpretq_s16_s32(a: int32x4_t) -> int16x8_t;
fn vreinterpretq_s16_u32(a: uint32x4_t) -> int16x8_t;
fn vreinterpretq_s32_s64(a: int64x2_t) -> int32x4_t;
fn vreinterpretq_s32_u64(a: uint64x2_t) -> int32x4_t;
fn vreinterpret_u8_p16(a: poly16x4_t) -> uint8x8_t;
fn vreinterpret_u8_s16(a: int16x4_t) -> uint8x8_t;
fn vreinterpret_u8_u16(a: uint16x4_t) -> uint8x8_t;
fn vreinterpret_u16_s32(a: int32x2_t) -> uint16x4_t;
fn vreinterpret_u16_u32(a: uint32x2_t) -> uint16x4_t;
fn vreinterpret_u32_s64(a: int64x1_t) -> uint32x2_t;
fn vreinterpret_u32_u64(a: uint64x1_t) -> uint32x2_t;
fn vreinterpretq_u8_p16(a: poly16x8_t) -> uint8x16_t;
fn vreinterpretq_u8_s16(a: int16x8_t) -> uint8x16_t;
fn vreinterpretq_u8_u16(a: uint16x8_t) -> uint8x16_t;
fn vreinterpretq_u16_s32(a:
int32x4_t) -> uint16x8_t; fn vreinterpretq_u16_u32(a: uint32x4_t) -> uint16x8_t; fn vreinterpretq_u32_s64(a: int64x2_t) -> uint32x4_t; fn vreinterpretq_u32_u64(a: uint64x2_t) -> uint32x4_t; fn vreinterpret_p8_p16(a: poly16x4_t) -> poly8x8_t; fn vreinterpret_p8_s16(a: int16x4_t) -> poly8x8_t; fn vreinterpret_p8_u16(a: uint16x4_t) -> poly8x8_t; fn vreinterpret_p16_s32(a: int32x2_t) -> poly16x4_t; fn vreinterpret_p16_u32(a: uint32x2_t) -> poly16x4_t; fn vreinterpretq_p8_p16(a: poly16x8_t) -> poly8x16_t; fn vreinterpretq_p8_s16(a: int16x8_t) -> poly8x16_t; fn vreinterpretq_p8_u16(a: uint16x8_t) -> poly8x16_t; fn vreinterpretq_p16_s32(a: int32x4_t) -> poly16x8_t; fn vreinterpretq_p16_u32(a: uint32x4_t) -> poly16x8_t; fn vreinterpret_u32_p64(a: poly64x1_t) -> uint32x2_t; fn vreinterpret_s16_p8(a: poly8x8_t) -> int16x4_t; fn vreinterpret_s16_s8(a: int8x8_t) -> int16x4_t; fn vreinterpret_s16_u8(a: uint8x8_t) -> int16x4_t; fn vreinterpret_s32_p16(a: poly16x4_t) -> int32x2_t; fn vreinterpret_s32_s16(a: int16x4_t) -> int32x2_t; fn vreinterpret_s32_u16(a: uint16x4_t) -> int32x2_t; fn vreinterpret_s64_s32(a: int32x2_t) -> int64x1_t; fn vreinterpret_s64_u32(a: uint32x2_t) -> int64x1_t; fn vreinterpretq_s16_p8(a: poly8x16_t) -> int16x8_t; fn vreinterpretq_s16_s8(a: int8x16_t) -> int16x8_t; fn vreinterpretq_s16_u8(a: uint8x16_t) -> int16x8_t; fn vreinterpretq_s32_p16(a: poly16x8_t) -> int32x4_t; fn vreinterpretq_s32_s16(a: int16x8_t) -> int32x4_t; fn vreinterpretq_s32_u16(a: uint16x8_t) -> int32x4_t; fn vreinterpretq_s64_s32(a: int32x4_t) -> int64x2_t; fn vreinterpretq_s64_u32(a: uint32x4_t) -> int64x2_t; fn vreinterpret_u16_p8(a: poly8x8_t) -> uint16x4_t; fn vreinterpret_u16_s8(a: int8x8_t) -> uint16x4_t; fn vreinterpret_u16_u8(a: uint8x8_t) -> uint16x4_t; fn vreinterpret_u32_p16(a: poly16x4_t) -> uint32x2_t; fn vreinterpret_u32_s16(a: int16x4_t) -> uint32x2_t; fn vreinterpret_u32_u16(a: uint16x4_t) -> uint32x2_t; fn vreinterpret_u64_s32(a: int32x2_t) -> uint64x1_t; fn vreinterpret_u64_u32(a: uint32x2_t) -> uint64x1_t; fn vreinterpretq_u16_p8(a: poly8x16_t) -> uint16x8_t; fn vreinterpretq_u16_s8(a: int8x16_t) -> uint16x8_t; fn vreinterpretq_u16_u8(a: uint8x16_t) -> uint16x8_t; fn vreinterpretq_u32_p16(a: poly16x8_t) -> uint32x4_t; fn vreinterpretq_u32_s16(a: int16x8_t) -> uint32x4_t; fn vreinterpretq_u32_u16(a: uint16x8_t) -> uint32x4_t; fn vreinterpretq_u64_s32(a: int32x4_t) -> uint64x2_t; fn vreinterpretq_u64_u32(a: uint32x4_t) -> uint64x2_t; fn vreinterpret_p16_p8(a: poly8x8_t) -> poly16x4_t; fn vreinterpret_p16_s8(a: int8x8_t) -> poly16x4_t; fn vreinterpret_p16_u8(a: uint8x8_t) -> poly16x4_t; fn vreinterpretq_p16_p8(a: poly8x16_t) -> poly16x8_t; fn vreinterpretq_p16_s8(a: int8x16_t) -> poly16x8_t; fn vreinterpretq_p16_u8(a: uint8x16_t) -> poly16x8_t; fn vreinterpret_s8_s32(a: int32x2_t) -> int8x8_t; fn vreinterpret_s8_u32(a: uint32x2_t) -> int8x8_t; fn vreinterpret_s16_s64(a: int64x1_t) -> int16x4_t; fn vreinterpret_s16_u64(a: uint64x1_t) -> int16x4_t; fn vreinterpretq_s8_s32(a: int32x4_t) -> int8x16_t; fn vreinterpretq_s8_u32(a: uint32x4_t) -> int8x16_t; fn vreinterpretq_s16_s64(a: int64x2_t) -> int16x8_t; fn vreinterpretq_s16_u64(a: uint64x2_t) -> int16x8_t; fn vreinterpret_u8_s32(a: int32x2_t) -> uint8x8_t; fn vreinterpret_u8_u32(a: uint32x2_t) -> uint8x8_t; fn vreinterpret_u16_s64(a: int64x1_t) -> uint16x4_t; fn vreinterpret_u16_u64(a: uint64x1_t) -> uint16x4_t; fn vreinterpretq_u8_s32(a: int32x4_t) -> uint8x16_t; fn vreinterpretq_u8_u32(a: uint32x4_t) -> uint8x16_t; fn vreinterpretq_u16_s64(a: 
int64x2_t) -> uint16x8_t; fn vreinterpretq_u16_u64(a: uint64x2_t) -> uint16x8_t; fn vreinterpret_p8_s32(a: int32x2_t) -> poly8x8_t; fn vreinterpret_p8_u32(a: uint32x2_t) -> poly8x8_t; fn vreinterpret_p16_s64(a: int64x1_t) -> poly16x4_t; fn vreinterpret_p16_u64(a: uint64x1_t) -> poly16x4_t; fn vreinterpretq_p8_s32(a: int32x4_t) -> poly8x16_t; fn vreinterpretq_p8_u32(a: uint32x4_t) -> poly8x16_t; fn vreinterpretq_p16_s64(a: int64x2_t) -> poly16x8_t; fn vreinterpretq_p16_u64(a: uint64x2_t) -> poly16x8_t; fn vreinterpret_s32_p8(a: poly8x8_t) -> int32x2_t; fn vreinterpret_s32_s8(a: int8x8_t) -> int32x2_t; fn vreinterpret_s32_u8(a: uint8x8_t) -> int32x2_t; fn vreinterpret_s64_p16(a: poly16x4_t) -> int64x1_t; fn vreinterpret_s64_s16(a: int16x4_t) -> int64x1_t; fn vreinterpret_s64_u16(a: uint16x4_t) -> int64x1_t; fn vreinterpretq_s32_p8(a: poly8x16_t) -> int32x4_t; fn vreinterpretq_s32_s8(a: int8x16_t) -> int32x4_t; fn vreinterpretq_s32_u8(a: uint8x16_t) -> int32x4_t; fn vreinterpretq_s64_p16(a: poly16x8_t) -> int64x2_t; fn vreinterpretq_s64_s16(a: int16x8_t) -> int64x2_t; fn vreinterpretq_s64_u16(a: uint16x8_t) -> int64x2_t; fn vreinterpret_u32_p8(a: poly8x8_t) -> uint32x2_t; fn vreinterpret_u32_s8(a: int8x8_t) -> uint32x2_t; fn vreinterpret_u32_u8(a: uint8x8_t) -> uint32x2_t; fn vreinterpret_u64_p16(a: poly16x4_t) -> uint64x1_t; fn vreinterpret_u64_s16(a: int16x4_t) -> uint64x1_t; fn vreinterpret_u64_u16(a: uint16x4_t) -> uint64x1_t; fn vreinterpretq_u32_p8(a: poly8x16_t) -> uint32x4_t; fn vreinterpretq_u32_s8(a: int8x16_t) -> uint32x4_t; fn vreinterpretq_u32_u8(a: uint8x16_t) -> uint32x4_t; fn vreinterpretq_u64_p16(a: poly16x8_t) -> uint64x2_t; fn vreinterpretq_u64_s16(a: int16x8_t) -> uint64x2_t; fn vreinterpretq_u64_u16(a: uint16x8_t) -> uint64x2_t; fn vreinterpret_s8_s64(a: int64x1_t) -> int8x8_t; fn vreinterpret_s8_u64(a: uint64x1_t) -> int8x8_t; fn vreinterpret_u8_s64(a: int64x1_t) -> uint8x8_t; fn vreinterpret_u8_u64(a: uint64x1_t) -> uint8x8_t; fn vreinterpret_p8_s64(a: int64x1_t) -> poly8x8_t; fn vreinterpret_p8_u64(a: uint64x1_t) -> poly8x8_t; fn vreinterpretq_s8_s64(a: int64x2_t) -> int8x16_t; fn vreinterpretq_s8_u64(a: uint64x2_t) -> int8x16_t; fn vreinterpretq_u8_s64(a: int64x2_t) -> uint8x16_t; fn vreinterpretq_u8_u64(a: uint64x2_t) -> uint8x16_t; fn vreinterpretq_p8_s64(a: int64x2_t) -> poly8x16_t; fn vreinterpretq_p8_u64(a: uint64x2_t) -> poly8x16_t; fn vreinterpret_s64_p8(a: poly8x8_t) -> int64x1_t; fn vreinterpret_s64_s8(a: int8x8_t) -> int64x1_t; fn vreinterpret_s64_u8(a: uint8x8_t) -> int64x1_t; fn vreinterpret_u64_p8(a: poly8x8_t) -> uint64x1_t; fn vreinterpret_u64_s8(a: int8x8_t) -> uint64x1_t; fn vreinterpret_u64_u8(a: uint8x8_t) -> uint64x1_t; fn vreinterpretq_s64_p8(a: poly8x16_t) -> int64x2_t; fn vreinterpretq_s64_s8(a: int8x16_t) -> int64x2_t; fn vreinterpretq_s64_u8(a: uint8x16_t) -> int64x2_t; fn vreinterpretq_u64_p8(a: poly8x16_t) -> uint64x2_t; fn vreinterpretq_u64_s8(a: int8x16_t) -> uint64x2_t; fn vreinterpretq_u64_u8(a: uint8x16_t) -> uint64x2_t; fn vreinterpret_s8_f32(a: float32x2_t) -> int8x8_t; fn vreinterpret_s16_f32(a: float32x2_t) -> int16x4_t; fn vreinterpret_s32_f32(a: float32x2_t) -> int32x2_t; fn vreinterpret_s64_f32(a: float32x2_t) -> int64x1_t; fn vreinterpretq_s8_f32(a: float32x4_t) -> int8x16_t; fn vreinterpretq_s16_f32(a: float32x4_t) -> int16x8_t; fn vreinterpretq_s32_f32(a: float32x4_t) -> int32x4_t; fn vreinterpretq_s64_f32(a: float32x4_t) -> int64x2_t; fn vreinterpret_u8_f32(a: float32x2_t) -> uint8x8_t; fn vreinterpret_u16_f32(a: float32x2_t) 
-> uint16x4_t; fn vreinterpret_u32_f32(a: float32x2_t) -> uint32x2_t; fn vreinterpret_u64_f32(a: float32x2_t) -> uint64x1_t; fn vreinterpretq_u8_f32(a: float32x4_t) -> uint8x16_t; fn vreinterpretq_u16_f32(a: float32x4_t) -> uint16x8_t; fn vreinterpretq_u32_f32(a: float32x4_t) -> uint32x4_t; fn vreinterpretq_u64_f32(a: float32x4_t) -> uint64x2_t; fn vreinterpret_p8_f32(a: float32x2_t) -> poly8x8_t; fn vreinterpret_p16_f32(a: float32x2_t) -> poly16x4_t; fn vreinterpretq_p8_f32(a: float32x4_t) -> poly8x16_t; fn vreinterpretq_p16_f32(a: float32x4_t) -> poly16x8_t; fn vreinterpretq_p128_f32(a: float32x4_t) -> p128; fn vreinterpret_f32_s8(a: int8x8_t) -> float32x2_t; fn vreinterpret_f32_s16(a: int16x4_t) -> float32x2_t; fn vreinterpret_f32_s32(a: int32x2_t) -> float32x2_t; fn vreinterpret_f32_s64(a: int64x1_t) -> float32x2_t; fn vreinterpretq_f32_s8(a: int8x16_t) -> float32x4_t; fn vreinterpretq_f32_s16(a: int16x8_t) -> float32x4_t; fn vreinterpretq_f32_s32(a: int32x4_t) -> float32x4_t; fn vreinterpretq_f32_s64(a: int64x2_t) -> float32x4_t; fn vreinterpret_f32_u8(a: uint8x8_t) -> float32x2_t; fn vreinterpret_f32_u16(a: uint16x4_t) -> float32x2_t; fn vreinterpret_f32_u32(a: uint32x2_t) -> float32x2_t; fn vreinterpret_f32_u64(a: uint64x1_t) -> float32x2_t; fn vreinterpretq_f32_u8(a: uint8x16_t) -> float32x4_t; fn vreinterpretq_f32_u16(a: uint16x8_t) -> float32x4_t; fn vreinterpretq_f32_u32(a: uint32x4_t) -> float32x4_t; fn vreinterpretq_f32_u64(a: uint64x2_t) -> float32x4_t; fn vreinterpret_f32_p8(a: poly8x8_t) -> float32x2_t; fn vreinterpret_f32_p16(a: poly16x4_t) -> float32x2_t; fn vreinterpretq_f32_p8(a: poly8x16_t) -> float32x4_t; fn vreinterpretq_f32_p16(a: poly16x8_t) -> float32x4_t; fn vreinterpretq_f32_p128(a: p128) -> float32x4_t; fn vrshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vrshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vrshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vrshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vrshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vrshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vrshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; fn vrshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vrshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; fn vrshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; fn vrshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; fn vrshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; fn vrshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; fn vrshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; fn vrshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; fn vrshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; fn vrshr_n_s8(a: int8x8_t) -> int8x8_t; fn vrshrq_n_s8(a: int8x16_t) -> int8x16_t; fn vrshr_n_s16(a: int16x4_t) -> int16x4_t; fn vrshrq_n_s16(a: int16x8_t) -> int16x8_t; fn vrshr_n_s32(a: int32x2_t) -> int32x2_t; fn vrshrq_n_s32(a: int32x4_t) -> int32x4_t; fn vrshr_n_s64(a: int64x1_t) -> int64x1_t; fn vrshrq_n_s64(a: int64x2_t) -> int64x2_t; fn vrshr_n_u8(a: uint8x8_t) -> uint8x8_t; fn vrshrq_n_u8(a: uint8x16_t) -> uint8x16_t; fn vrshr_n_u16(a: uint16x4_t) -> uint16x4_t; fn vrshrq_n_u16(a: uint16x8_t) -> uint16x8_t; fn vrshr_n_u32(a: uint32x2_t) -> uint32x2_t; fn vrshrq_n_u32(a: uint32x4_t) -> uint32x4_t; fn vrshr_n_u64(a: uint64x1_t) -> uint64x1_t; fn vrshrq_n_u64(a: uint64x2_t) -> uint64x2_t; fn vrshrn_n_s16(a: int16x8_t) -> int8x8_t; fn vrshrn_n_s32(a: int32x4_t) -> int16x4_t; fn vrshrn_n_s64(a: int64x2_t) -> int32x2_t; fn vrshrn_n_u16(a: uint16x8_t) -> uint8x8_t; fn vrshrn_n_u32(a: 
uint32x4_t) -> uint16x4_t; fn vrshrn_n_u64(a: uint64x2_t) -> uint32x2_t; fn vrsra_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vrsraq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vrsra_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vrsraq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vrsra_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vrsraq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vrsra_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; fn vrsraq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vrsra_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; fn vrsraq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vrsra_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; fn vrsraq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vrsra_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; fn vrsraq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vrsra_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t; fn vrsraq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; fn vrsubhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t; fn vrsubhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t; fn vrsubhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t; fn vrsubhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t; fn vrsubhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t; fn vrsubhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t; fn vset_lane_s8(a: i8, b: int8x8_t) -> int8x8_t; fn vset_lane_s16(a: i16, b: int16x4_t) -> int16x4_t; fn vset_lane_s32(a: i32, b: int32x2_t) -> int32x2_t; fn vset_lane_s64(a: i64, b: int64x1_t) -> int64x1_t; fn vset_lane_u8(a: u8, b: uint8x8_t) -> uint8x8_t; fn vset_lane_u16(a: u16, b: uint16x4_t) -> uint16x4_t; fn vset_lane_u32(a: u32, b: uint32x2_t) -> uint32x2_t; fn vset_lane_u64(a: u64, b: uint64x1_t) -> uint64x1_t; fn vset_lane_p8(a: p8, b: poly8x8_t) -> poly8x8_t; fn vset_lane_p16(a: p16, b: poly16x4_t) -> poly16x4_t; fn vsetq_lane_s8(a: i8, b: int8x16_t) -> int8x16_t; fn vsetq_lane_s16(a: i16, b: int16x8_t) -> int16x8_t; fn vsetq_lane_s32(a: i32, b: int32x4_t) -> int32x4_t; fn vsetq_lane_s64(a: i64, b: int64x2_t) -> int64x2_t; fn vsetq_lane_u8(a: u8, b: uint8x16_t) -> uint8x16_t; fn vsetq_lane_u16(a: u16, b: uint16x8_t) -> uint16x8_t; fn vsetq_lane_u32(a: u32, b: uint32x4_t) -> uint32x4_t; fn vsetq_lane_u64(a: u64, b: uint64x2_t) -> uint64x2_t; fn vsetq_lane_p8(a: p8, b: poly8x16_t) -> poly8x16_t; fn vsetq_lane_p16(a: p16, b: poly16x8_t) -> poly16x8_t; fn vset_lane_f32(a: f32, b: float32x2_t) -> float32x2_t; fn vsetq_lane_f32(a: f32, b: float32x4_t) -> float32x4_t; fn vshl_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vshlq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vshl_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vshlq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vshl_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vshlq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vshl_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; fn vshlq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vshl_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; fn vshlq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; fn vshl_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; fn vshlq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; fn vshl_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; fn vshlq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; fn vshl_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; fn vshlq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; fn vshl_n_s8(a: int8x8_t) -> int8x8_t; fn vshlq_n_s8(a: int8x16_t) -> int8x16_t; fn vshl_n_s16(a: int16x4_t) -> int16x4_t; fn 
vshlq_n_s16(a: int16x8_t) -> int16x8_t; fn vshl_n_s32(a: int32x2_t) -> int32x2_t; fn vshlq_n_s32(a: int32x4_t) -> int32x4_t; fn vshl_n_u8(a: uint8x8_t) -> uint8x8_t; fn vshlq_n_u8(a: uint8x16_t) -> uint8x16_t; fn vshl_n_u16(a: uint16x4_t) -> uint16x4_t; fn vshlq_n_u16(a: uint16x8_t) -> uint16x8_t; fn vshl_n_u32(a: uint32x2_t) -> uint32x2_t; fn vshlq_n_u32(a: uint32x4_t) -> uint32x4_t; fn vshl_n_s64(a: int64x1_t) -> int64x1_t; fn vshlq_n_s64(a: int64x2_t) -> int64x2_t; fn vshl_n_u64(a: uint64x1_t) -> uint64x1_t; fn vshlq_n_u64(a: uint64x2_t) -> uint64x2_t; fn vshll_n_s8(a: int8x8_t) -> int16x8_t; fn vshll_n_s16(a: int16x4_t) -> int32x4_t; fn vshll_n_s32(a: int32x2_t) -> int64x2_t; fn vshll_n_u8(a: uint8x8_t) -> uint16x8_t; fn vshll_n_u16(a: uint16x4_t) -> uint32x4_t; fn vshll_n_u32(a: uint32x2_t) -> uint64x2_t; fn vshr_n_s8(a: int8x8_t) -> int8x8_t; fn vshrq_n_s8(a: int8x16_t) -> int8x16_t; fn vshr_n_s16(a: int16x4_t) -> int16x4_t; fn vshrq_n_s16(a: int16x8_t) -> int16x8_t; fn vshr_n_s32(a: int32x2_t) -> int32x2_t; fn vshrq_n_s32(a: int32x4_t) -> int32x4_t; fn vshr_n_s64(a: int64x1_t) -> int64x1_t; fn vshrq_n_s64(a: int64x2_t) -> int64x2_t; fn vshr_n_u8(a: uint8x8_t) -> uint8x8_t; fn vshrq_n_u8(a: uint8x16_t) -> uint8x16_t; fn vshr_n_u16(a: uint16x4_t) -> uint16x4_t; fn vshrq_n_u16(a: uint16x8_t) -> uint16x8_t; fn vshr_n_u32(a: uint32x2_t) -> uint32x2_t; fn vshrq_n_u32(a: uint32x4_t) -> uint32x4_t; fn vshr_n_u64(a: uint64x1_t) -> uint64x1_t; fn vshrq_n_u64(a: uint64x2_t) -> uint64x2_t; fn vshrn_n_s16(a: int16x8_t) -> int8x8_t; fn vshrn_n_s32(a: int32x4_t) -> int16x4_t; fn vshrn_n_s64(a: int64x2_t) -> int32x2_t; fn vshrn_n_u16(a: uint16x8_t) -> uint8x8_t; fn vshrn_n_u32(a: uint32x4_t) -> uint16x4_t; fn vshrn_n_u64(a: uint64x2_t) -> uint32x2_t; fn vsra_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vsraq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vsra_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vsraq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vsra_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vsraq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vsra_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; fn vsraq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vsra_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; fn vsraq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vsra_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; fn vsraq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vsra_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; fn vsraq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vsra_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t; fn vsraq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; fn vtrn_s8(a: int8x8_t, b: int8x8_t) -> int8x8x2_t; fn vtrn_s16(a: int16x4_t, b: int16x4_t) -> int16x4x2_t; fn vtrnq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t; fn vtrnq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t; fn vtrnq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t; fn vtrn_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8x2_t; fn vtrn_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4x2_t; fn vtrnq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t; fn vtrnq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t; fn vtrnq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t; fn vtrn_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8x2_t; fn vtrn_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4x2_t; fn vtrnq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t; fn vtrnq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t; fn vtrn_s32(a: int32x2_t, b: int32x2_t) -> int32x2x2_t; fn 
vtrn_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t; fn vtrn_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t; fn vtrnq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t; fn vzip_s8(a: int8x8_t, b: int8x8_t) -> int8x8x2_t; fn vzip_s16(a: int16x4_t, b: int16x4_t) -> int16x4x2_t; fn vzip_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8x2_t; fn vzip_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4x2_t; fn vzip_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8x2_t; fn vzip_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4x2_t; fn vzip_s32(a: int32x2_t, b: int32x2_t) -> int32x2x2_t; fn vzip_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t; fn vzipq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t; fn vzipq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t; fn vzipq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t; fn vzipq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t; fn vzipq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t; fn vzipq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t; fn vzipq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t; fn vzipq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t; fn vzip_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t; fn vzipq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t; fn vuzp_s8(a: int8x8_t, b: int8x8_t) -> int8x8x2_t; fn vuzp_s16(a: int16x4_t, b: int16x4_t) -> int16x4x2_t; fn vuzpq_s8(a: int8x16_t, b: int8x16_t) -> int8x16x2_t; fn vuzpq_s16(a: int16x8_t, b: int16x8_t) -> int16x8x2_t; fn vuzpq_s32(a: int32x4_t, b: int32x4_t) -> int32x4x2_t; fn vuzp_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8x2_t; fn vuzp_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4x2_t; fn vuzpq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16x2_t; fn vuzpq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8x2_t; fn vuzpq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4x2_t; fn vuzp_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8x2_t; fn vuzp_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4x2_t; fn vuzpq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16x2_t; fn vuzpq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8x2_t; fn vuzp_s32(a: int32x2_t, b: int32x2_t) -> int32x2x2_t; fn vuzp_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2x2_t; fn vuzp_f32(a: float32x2_t, b: float32x2_t) -> float32x2x2_t; fn vuzpq_f32(a: float32x4_t, b: float32x4_t) -> float32x4x2_t; fn vabal_u8(a: uint16x8_t, b: uint8x8_t, c: uint8x8_t) -> uint16x8_t; fn vabal_u16(a: uint32x4_t, b: uint16x4_t, c: uint16x4_t) -> uint32x4_t; fn vabal_u32(a: uint64x2_t, b: uint32x2_t, c: uint32x2_t) -> uint64x2_t; fn vabal_s8(a: int16x8_t, b: int8x8_t, c: int8x8_t) -> int16x8_t; fn vabal_s16(a: int32x4_t, b: int16x4_t, c: int16x4_t) -> int32x4_t; fn vabal_s32(a: int64x2_t, b: int32x2_t, c: int32x2_t) -> int64x2_t; fn vqabs_s8(a: int8x8_t) -> int8x8_t; fn vqabsq_s8(a: int8x16_t) -> int8x16_t; fn vqabs_s16(a: int16x4_t) -> int16x4_t; fn vqabsq_s16(a: int16x8_t) -> int16x8_t; fn vqabs_s32(a: int32x2_t) -> int32x2_t; fn vqabsq_s32(a: int32x4_t) -> int32x4_t; unsafe fn vld1_lane_s8(ptr: *const i8, src: int8x8_t) -> int8x8_t; unsafe fn vld1q_lane_s8(ptr: *const i8, src: int8x16_t) -> int8x16_t; unsafe fn vld1_lane_s16(ptr: *const i16, src: int16x4_t) -> int16x4_t; unsafe fn vld1q_lane_s16(ptr: *const i16, src: int16x8_t) -> int16x8_t; unsafe fn vld1_lane_s32(ptr: *const i32, src: int32x2_t) -> int32x2_t; unsafe fn vld1q_lane_s32(ptr: *const i32, src: int32x4_t) -> int32x4_t; unsafe fn vld1_lane_s64(ptr: *const i64, src: int64x1_t) -> int64x1_t; unsafe fn vld1q_lane_s64(ptr: *const i64, src: int64x2_t) -> int64x2_t; unsafe fn vld1_lane_u8(ptr: *const u8, src: uint8x8_t) 
unsafe fn vld1_lane_s8(ptr: *const i8, src: int8x8_t) -> int8x8_t;
unsafe fn vld1q_lane_s8(ptr: *const i8, src: int8x16_t) -> int8x16_t;
unsafe fn vld1_lane_s16(ptr: *const i16, src: int16x4_t) -> int16x4_t;
unsafe fn vld1q_lane_s16(ptr: *const i16, src: int16x8_t) -> int16x8_t;
unsafe fn vld1_lane_s32(ptr: *const i32, src: int32x2_t) -> int32x2_t;
unsafe fn vld1q_lane_s32(ptr: *const i32, src: int32x4_t) -> int32x4_t;
unsafe fn vld1_lane_s64(ptr: *const i64, src: int64x1_t) -> int64x1_t;
unsafe fn vld1q_lane_s64(ptr: *const i64, src: int64x2_t) -> int64x2_t;
unsafe fn vld1_lane_u8(ptr: *const u8, src: uint8x8_t) -> uint8x8_t;
unsafe fn vld1q_lane_u8(ptr: *const u8, src: uint8x16_t) -> uint8x16_t;
unsafe fn vld1_lane_u16(ptr: *const u16, src: uint16x4_t) -> uint16x4_t;
unsafe fn vld1q_lane_u16(ptr: *const u16, src: uint16x8_t) -> uint16x8_t;
unsafe fn vld1_lane_u32(ptr: *const u32, src: uint32x2_t) -> uint32x2_t;
unsafe fn vld1q_lane_u32(ptr: *const u32, src: uint32x4_t) -> uint32x4_t;
unsafe fn vld1_lane_u64(ptr: *const u64, src: uint64x1_t) -> uint64x1_t;
unsafe fn vld1q_lane_u64(ptr: *const u64, src: uint64x2_t) -> uint64x2_t;
unsafe fn vld1_lane_p8(ptr: *const p8, src: poly8x8_t) -> poly8x8_t;
unsafe fn vld1q_lane_p8(ptr: *const p8, src: poly8x16_t) -> poly8x16_t;
unsafe fn vld1_lane_p16(ptr: *const p16, src: poly16x4_t) -> poly16x4_t;
unsafe fn vld1q_lane_p16(ptr: *const p16, src: poly16x8_t) -> poly16x8_t;
unsafe fn vld1_lane_f32(ptr: *const f32, src: float32x2_t) -> float32x2_t;
unsafe fn vld1q_lane_f32(ptr: *const f32, src: float32x4_t) -> float32x4_t;
unsafe fn vld1_dup_s8(ptr: *const i8) -> int8x8_t;
unsafe fn vld1q_dup_s8(ptr: *const i8) -> int8x16_t;
unsafe fn vld1_dup_s16(ptr: *const i16) -> int16x4_t;
unsafe fn vld1q_dup_s16(ptr: *const i16) -> int16x8_t;
unsafe fn vld1_dup_s32(ptr: *const i32) -> int32x2_t;
unsafe fn vld1q_dup_s32(ptr: *const i32) -> int32x4_t;
unsafe fn vld1_dup_s64(ptr: *const i64) -> int64x1_t;
unsafe fn vld1q_dup_s64(ptr: *const i64) -> int64x2_t;
unsafe fn vld1_dup_u8(ptr: *const u8) -> uint8x8_t;
unsafe fn vld1q_dup_u8(ptr: *const u8) -> uint8x16_t;
unsafe fn vld1_dup_u16(ptr: *const u16) -> uint16x4_t;
unsafe fn vld1q_dup_u16(ptr: *const u16) -> uint16x8_t;
unsafe fn vld1_dup_u32(ptr: *const u32) -> uint32x2_t;
unsafe fn vld1q_dup_u32(ptr: *const u32) -> uint32x4_t;
unsafe fn vld1_dup_u64(ptr: *const u64) -> uint64x1_t;
unsafe fn vld1q_dup_u64(ptr: *const u64) -> uint64x2_t;
unsafe fn vld1_dup_p8(ptr: *const p8) -> poly8x8_t;
unsafe fn vld1q_dup_p8(ptr: *const p8) -> poly8x16_t;
unsafe fn vld1_dup_p16(ptr: *const p16) -> poly16x4_t;
unsafe fn vld1q_dup_p16(ptr: *const p16) -> poly16x8_t;
unsafe fn vld1_dup_f32(ptr: *const f32) -> float32x2_t;
unsafe fn vld1q_dup_f32(ptr: *const f32) -> float32x4_t;
fn vaba_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
fn vaba_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t;
fn vaba_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t;
fn vaba_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t;
fn vaba_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t;
fn vaba_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t;
fn vabaq_s8(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t;
fn vabaq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
fn vabaq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
fn vabaq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t;
fn vabaq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t;
fn vabaq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
fn vabs_s8(a: int8x8_t) -> int8x8_t;
fn vabs_s16(a: int16x4_t) -> int16x4_t;
fn vabs_s32(a: int32x2_t) -> int32x2_t;
fn vabsq_s8(a: int8x16_t) -> int8x16_t;
fn vabsq_s16(a: int16x8_t) -> int16x8_t;
fn vabsq_s32(a: int32x4_t) -> int32x4_t;
fn vpadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vpadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vpadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vpadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vpadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vpadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vadd_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vadd_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vadd_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
fn vadd_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
fn vadd_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vadd_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vadd_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vaddl_s8(a: int8x8_t, b: int8x8_t) -> int16x8_t;
fn vaddl_s16(a: int16x4_t, b: int16x4_t) -> int32x4_t;
fn vaddl_s32(a: int32x2_t, b: int32x2_t) -> int64x2_t;
fn vaddl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t;
fn vaddl_u16(a: uint16x4_t, b: uint16x4_t) -> uint32x4_t;
fn vaddl_u32(a: uint32x2_t, b: uint32x2_t) -> uint64x2_t;
fn vaddl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t;
fn vaddl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t;
fn vaddl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t;
fn vaddl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t;
fn vaddl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t;
fn vaddl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t;
fn vaddw_s8(a: int16x8_t, b: int8x8_t) -> int16x8_t;
fn vaddw_s16(a: int32x4_t, b: int16x4_t) -> int32x4_t;
fn vaddw_s32(a: int64x2_t, b: int32x2_t) -> int64x2_t;
fn vaddw_u8(a: uint16x8_t, b: uint8x8_t) -> uint16x8_t;
fn vaddw_u16(a: uint32x4_t, b: uint16x4_t) -> uint32x4_t;
fn vaddw_u32(a: uint64x2_t, b: uint32x2_t) -> uint64x2_t;
fn vaddw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t;
fn vaddw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t;
fn vaddw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t;
fn vaddw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t;
fn vaddw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t;
fn vaddw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t;
fn vaddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t;
fn vaddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t;
fn vaddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t;
fn vaddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t;
fn vaddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t;
fn vaddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t;
fn vaddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t;
fn vaddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t;
fn vaddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t;
fn vaddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t;
fn vaddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t;
fn vaddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t;
fn vraddhn_s16(a: int16x8_t, b: int16x8_t) -> int8x8_t;
fn vraddhn_s32(a: int32x4_t, b: int32x4_t) -> int16x4_t;
fn vraddhn_s64(a: int64x2_t, b: int64x2_t) -> int32x2_t;
fn vraddhn_u16(a: uint16x8_t, b: uint16x8_t) -> uint8x8_t;
fn vraddhn_u32(a: uint32x4_t, b: uint32x4_t) -> uint16x4_t;
fn vraddhn_u64(a: uint64x2_t, b: uint64x2_t) -> uint32x2_t;
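// Sketch of a widening add from the vaddl family above (assumes the
// core::arch::aarch64 intrinsic of the same name):
//
//     use core::arch::aarch64::*;
//     // Each u8 lane pair is summed into a u16 lane, so no overflow is possible.
//     unsafe fn widen_add(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
//         vaddl_u8(a, b)
//     }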
fn vraddhn_high_s16(r: int8x8_t, a: int16x8_t, b: int16x8_t) -> int8x16_t;
fn vraddhn_high_s32(r: int16x4_t, a: int32x4_t, b: int32x4_t) -> int16x8_t;
fn vraddhn_high_s64(r: int32x2_t, a: int64x2_t, b: int64x2_t) -> int32x4_t;
fn vraddhn_high_u16(r: uint8x8_t, a: uint16x8_t, b: uint16x8_t) -> uint8x16_t;
fn vraddhn_high_u32(r: uint16x4_t, a: uint32x4_t, b: uint32x4_t) -> uint16x8_t;
fn vraddhn_high_u64(r: uint32x2_t, a: uint64x2_t, b: uint64x2_t) -> uint32x4_t;
fn vpaddl_s8(a: int8x8_t) -> int16x4_t;
fn vpaddl_s16(a: int16x4_t) -> int32x2_t;
fn vpaddl_s32(a: int32x2_t) -> int64x1_t;
fn vpaddlq_s8(a: int8x16_t) -> int16x8_t;
fn vpaddlq_s16(a: int16x8_t) -> int32x4_t;
fn vpaddlq_s32(a: int32x4_t) -> int64x2_t;
fn vpaddl_u8(a: uint8x8_t) -> uint16x4_t;
fn vpaddl_u16(a: uint16x4_t) -> uint32x2_t;
fn vpaddl_u32(a: uint32x2_t) -> uint64x1_t;
fn vpaddlq_u8(a: uint8x16_t) -> uint16x8_t;
fn vpaddlq_u16(a: uint16x8_t) -> uint32x4_t;
fn vpaddlq_u32(a: uint32x4_t) -> uint64x2_t;
fn vmovn_s16(a: int16x8_t) -> int8x8_t;
fn vmovn_s32(a: int32x4_t) -> int16x4_t;
fn vmovn_s64(a: int64x2_t) -> int32x2_t;
fn vmovn_u16(a: uint16x8_t) -> uint8x8_t;
fn vmovn_u32(a: uint32x4_t) -> uint16x4_t;
fn vmovn_u64(a: uint64x2_t) -> uint32x2_t;
fn vmovl_s8(a: int8x8_t) -> int16x8_t;
fn vmovl_s16(a: int16x4_t) -> int32x4_t;
fn vmovl_s32(a: int32x2_t) -> int64x2_t;
fn vmovl_u8(a: uint8x8_t) -> uint16x8_t;
fn vmovl_u16(a: uint16x4_t) -> uint32x4_t;
fn vmovl_u32(a: uint32x2_t) -> uint64x2_t;
fn vmvn_s8(a: int8x8_t) -> int8x8_t;
fn vmvnq_s8(a: int8x16_t) -> int8x16_t;
fn vmvn_s16(a: int16x4_t) -> int16x4_t;
fn vmvnq_s16(a: int16x8_t) -> int16x8_t;
fn vmvn_s32(a: int32x2_t) -> int32x2_t;
fn vmvnq_s32(a: int32x4_t) -> int32x4_t;
fn vmvn_u8(a: uint8x8_t) -> uint8x8_t;
fn vmvnq_u8(a: uint8x16_t) -> uint8x16_t;
fn vmvn_u16(a: uint16x4_t) -> uint16x4_t;
fn vmvnq_u16(a: uint16x8_t) -> uint16x8_t;
fn vmvn_u32(a: uint32x2_t) -> uint32x2_t;
fn vmvnq_u32(a: uint32x4_t) -> uint32x4_t;
fn vmvn_p8(a: poly8x8_t) -> poly8x8_t;
fn vmvnq_p8(a: poly8x16_t) -> poly8x16_t;
fn vbic_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vbicq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vbic_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vbicq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vbic_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vbicq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vbic_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
fn vbicq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
fn vbic_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vbicq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
fn vbic_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vbicq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vbic_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vbicq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vbic_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
fn vbicq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vbsl_s8(a: uint8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t;
fn vbsl_s16(a: uint16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t;
fn vbsl_s32(a: uint32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t;
fn vbsl_s64(a: uint64x1_t, b: int64x1_t, c: int64x1_t) -> int64x1_t;
fn vbsl_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t;
fn vbsl_u16(a: uint16x4_t, b: uint16x4_t, c: uint16x4_t) -> uint16x4_t;
fn vbsl_u32(a: uint32x2_t, b: uint32x2_t, c: uint32x2_t) -> uint32x2_t;
fn vbsl_u64(a: uint64x1_t, b: uint64x1_t, c: uint64x1_t) -> uint64x1_t;
fn vbsl_f32(a: uint32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
fn vbsl_p8(a: uint8x8_t, b: poly8x8_t, c: poly8x8_t) -> poly8x8_t;
fn vbsl_p16(a: uint16x4_t, b: poly16x4_t, c: poly16x4_t) -> poly16x4_t;
fn vbslq_s8(a: uint8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t;
fn vbslq_s16(a: uint16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t;
fn vbslq_s32(a: uint32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t;
fn vbslq_s64(a: uint64x2_t, b: int64x2_t, c: int64x2_t) -> int64x2_t;
fn vbslq_u8(a: uint8x16_t, b: uint8x16_t, c: uint8x16_t) -> uint8x16_t;
fn vbslq_u16(a: uint16x8_t, b: uint16x8_t, c: uint16x8_t) -> uint16x8_t;
fn vbslq_u32(a: uint32x4_t, b: uint32x4_t, c: uint32x4_t) -> uint32x4_t;
fn vbslq_u64(a: uint64x2_t, b: uint64x2_t, c: uint64x2_t) -> uint64x2_t;
fn vbslq_p8(a: uint8x16_t, b: poly8x16_t, c: poly8x16_t) -> poly8x16_t;
fn vbslq_p16(a: uint16x8_t, b: poly16x8_t, c: poly16x8_t) -> poly16x8_t;
fn vbslq_f32(a: uint32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
fn vorn_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vornq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vorn_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vornq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vorn_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vornq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vorn_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
fn vornq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
fn vorn_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vornq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
fn vorn_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vornq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vorn_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vornq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vorn_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
fn vornq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vpmin_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vpmin_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vpmin_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vpmin_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vpmin_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vpmin_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vpmin_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vpmax_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vpmax_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vpmax_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vpmax_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vpmax_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vpmax_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vpmax_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vgetq_lane_u64(v: uint64x2_t) -> u64;
fn vget_lane_u64(v: uint64x1_t) -> u64;
fn vget_lane_u16(v: uint16x4_t) -> u16;
fn vget_lane_s16(v: int16x4_t) -> i16;
fn vget_lane_p16(v: poly16x4_t) -> p16;
fn vget_lane_u32(v: uint32x2_t) -> u32;
fn vget_lane_s32(v: int32x2_t) -> i32;
fn vget_lane_f32(v: float32x2_t) -> f32;
fn vgetq_lane_f32(v: float32x4_t) -> f32;
fn vget_lane_p64(v: poly64x1_t) -> p64;
fn vgetq_lane_p64(v: poly64x2_t) -> p64;
fn vget_lane_s64(v: int64x1_t) -> i64;
fn vgetq_lane_s64(v: int64x2_t) -> i64;
fn vgetq_lane_u16(v: uint16x8_t) -> u16;
fn vgetq_lane_u32(v: uint32x4_t) -> u32;
fn vgetq_lane_s16(v: int16x8_t) -> i16;
fn vgetq_lane_p16(v: poly16x8_t) -> p16;
fn vgetq_lane_s32(v: int32x4_t) -> i32;
fn vget_lane_u8(v: uint8x8_t) -> u8;
fn vget_lane_s8(v: int8x8_t) -> i8;
fn vget_lane_p8(v: poly8x8_t) -> p8;
fn vgetq_lane_u8(v: uint8x16_t) -> u8;
fn vgetq_lane_s8(v: int8x16_t) -> i8;
fn vgetq_lane_p8(v: poly8x16_t) -> p8;
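// Sketch of bitwise select (vbsl) as a branchless blend, assuming the
// core::arch::aarch64 intrinsics; vcgezq_f32 appears later in this listing:
//
//     use core::arch::aarch64::*;
//     // Keeps lanes where v >= 0 and zeroes the rest (a ReLU).
//     unsafe fn relu4(v: float32x4_t) -> float32x4_t {
//         let mask = vcgezq_f32(v);           // all-ones where v >= 0
//         vbslq_f32(mask, v, vdupq_n_f32(0.0)) // mask ? v : 0.0
//     }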
fn vget_high_s8(a: int8x16_t) -> int8x8_t;
fn vget_high_s16(a: int16x8_t) -> int16x4_t;
fn vget_high_s32(a: int32x4_t) -> int32x2_t;
fn vget_high_s64(a: int64x2_t) -> int64x1_t;
fn vget_high_u8(a: uint8x16_t) -> uint8x8_t;
fn vget_high_u16(a: uint16x8_t) -> uint16x4_t;
fn vget_high_u32(a: uint32x4_t) -> uint32x2_t;
fn vget_high_u64(a: uint64x2_t) -> uint64x1_t;
fn vget_high_p8(a: poly8x16_t) -> poly8x8_t;
fn vget_high_p16(a: poly16x8_t) -> poly16x4_t;
fn vget_high_f32(a: float32x4_t) -> float32x2_t;
fn vget_low_s8(a: int8x16_t) -> int8x8_t;
fn vget_low_s16(a: int16x8_t) -> int16x4_t;
fn vget_low_s32(a: int32x4_t) -> int32x2_t;
fn vget_low_s64(a: int64x2_t) -> int64x1_t;
fn vget_low_u8(a: uint8x16_t) -> uint8x8_t;
fn vget_low_u16(a: uint16x8_t) -> uint16x4_t;
fn vget_low_u32(a: uint32x4_t) -> uint32x2_t;
fn vget_low_u64(a: uint64x2_t) -> uint64x1_t;
fn vget_low_p8(a: poly8x16_t) -> poly8x8_t;
fn vget_low_p16(a: poly16x8_t) -> poly16x4_t;
fn vget_low_f32(a: float32x4_t) -> float32x2_t;
fn vdupq_n_s8(value: i8) -> int8x16_t;
fn vdupq_n_s16(value: i16) -> int16x8_t;
fn vdupq_n_s32(value: i32) -> int32x4_t;
fn vdupq_n_s64(value: i64) -> int64x2_t;
fn vdupq_n_u8(value: u8) -> uint8x16_t;
fn vdupq_n_u16(value: u16) -> uint16x8_t;
fn vdupq_n_u32(value: u32) -> uint32x4_t;
fn vdupq_n_u64(value: u64) -> uint64x2_t;
fn vdupq_n_p8(value: p8) -> poly8x16_t;
fn vdupq_n_p16(value: p16) -> poly16x8_t;
fn vdupq_n_f32(value: f32) -> float32x4_t;
fn vdup_n_s8(value: i8) -> int8x8_t;
fn vdup_n_s16(value: i16) -> int16x4_t;
fn vdup_n_s32(value: i32) -> int32x2_t;
fn vdup_n_s64(value: i64) -> int64x1_t;
fn vdup_n_u8(value: u8) -> uint8x8_t;
fn vdup_n_u16(value: u16) -> uint16x4_t;
fn vdup_n_u32(value: u32) -> uint32x2_t;
fn vdup_n_u64(value: u64) -> uint64x1_t;
fn vdup_n_p8(value: p8) -> poly8x8_t;
fn vdup_n_p16(value: p16) -> poly16x4_t;
fn vdup_n_f32(value: f32) -> float32x2_t;
unsafe fn vldrq_p128(a: *const p128) -> p128;
unsafe fn vstrq_p128(a: *mut p128, b: p128);
fn vmov_n_s8(value: i8) -> int8x8_t;
fn vmov_n_s16(value: i16) -> int16x4_t;
fn vmov_n_s32(value: i32) -> int32x2_t;
fn vmov_n_s64(value: i64) -> int64x1_t;
fn vmov_n_u8(value: u8) -> uint8x8_t;
fn vmov_n_u16(value: u16) -> uint16x4_t;
fn vmov_n_u32(value: u32) -> uint32x2_t;
fn vmov_n_u64(value: u64) -> uint64x1_t;
fn vmov_n_p8(value: p8) -> poly8x8_t;
fn vmov_n_p16(value: p16) -> poly16x4_t;
fn vmov_n_f32(value: f32) -> float32x2_t;
fn vmovq_n_s8(value: i8) -> int8x16_t;
fn vmovq_n_s16(value: i16) -> int16x8_t;
fn vmovq_n_s32(value: i32) -> int32x4_t;
fn vmovq_n_s64(value: i64) -> int64x2_t;
fn vmovq_n_u8(value: u8) -> uint8x16_t;
fn vmovq_n_u16(value: u16) -> uint16x8_t;
fn vmovq_n_u32(value: u32) -> uint32x4_t;
fn vmovq_n_u64(value: u64) -> uint64x2_t;
fn vmovq_n_p8(value: p8) -> poly8x16_t;
fn vmovq_n_p16(value: p16) -> poly16x8_t;
fn vmovq_n_f32(value: f32) -> float32x4_t;
fn vext_s64(a: int64x1_t, _b: int64x1_t) -> int64x1_t;
fn vext_u64(a: uint64x1_t, _b: uint64x1_t) -> uint64x1_t;
fn vcnt_s8(a: int8x8_t) -> int8x8_t;
fn vcntq_s8(a: int8x16_t) -> int8x16_t;
fn vcnt_u8(a: uint8x8_t) -> uint8x8_t;
fn vcntq_u8(a: uint8x16_t) -> uint8x16_t;
fn vcnt_p8(a: poly8x8_t) -> poly8x8_t;
fn vcntq_p8(a: poly8x16_t) -> poly8x16_t;
fn vrev16_s8(a: int8x8_t) -> int8x8_t;
fn vrev16q_s8(a: int8x16_t) -> int8x16_t;
fn vrev16_u8(a: uint8x8_t) -> uint8x8_t;
fn vrev16q_u8(a: uint8x16_t) -> uint8x16_t;
fn vrev16_p8(a: poly8x8_t) -> poly8x8_t;
fn vrev16q_p8(a: poly8x16_t) -> poly8x16_t;
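// Sketch of the classic popcount reduction built from vcnt above plus the
// vpaddlq chain listed earlier (assumes core::arch::aarch64 intrinsics):
//
//     use core::arch::aarch64::*;
//     // Per-byte popcount, then pairwise-widened into two u64 partial sums.
//     unsafe fn popcount16(bytes: uint8x16_t) -> uint64x2_t {
//         vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vcntq_u8(bytes))))
//     }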
fn vrev32_s8(a: int8x8_t) -> int8x8_t;
fn vrev32q_s8(a: int8x16_t) -> int8x16_t;
fn vrev32_u8(a: uint8x8_t) -> uint8x8_t;
fn vrev32q_u8(a: uint8x16_t) -> uint8x16_t;
fn vrev32_s16(a: int16x4_t) -> int16x4_t;
fn vrev32q_s16(a: int16x8_t) -> int16x8_t;
fn vrev32_p16(a: poly16x4_t) -> poly16x4_t;
fn vrev32q_p16(a: poly16x8_t) -> poly16x8_t;
fn vrev32_u16(a: uint16x4_t) -> uint16x4_t;
fn vrev32q_u16(a: uint16x8_t) -> uint16x8_t;
fn vrev32_p8(a: poly8x8_t) -> poly8x8_t;
fn vrev32q_p8(a: poly8x16_t) -> poly8x16_t;
fn vrev64_s8(a: int8x8_t) -> int8x8_t;
fn vrev64q_s8(a: int8x16_t) -> int8x16_t;
fn vrev64_s16(a: int16x4_t) -> int16x4_t;
fn vrev64q_s16(a: int16x8_t) -> int16x8_t;
fn vrev64_s32(a: int32x2_t) -> int32x2_t;
fn vrev64q_s32(a: int32x4_t) -> int32x4_t;
fn vrev64_u8(a: uint8x8_t) -> uint8x8_t;
fn vrev64q_u8(a: uint8x16_t) -> uint8x16_t;
fn vrev64_u16(a: uint16x4_t) -> uint16x4_t;
fn vrev64q_u16(a: uint16x8_t) -> uint16x8_t;
fn vrev64_u32(a: uint32x2_t) -> uint32x2_t;
fn vrev64q_u32(a: uint32x4_t) -> uint32x4_t;
fn vrev64_f32(a: float32x2_t) -> float32x2_t;
fn vrev64q_f32(a: float32x4_t) -> float32x4_t;
fn vrev64_p8(a: poly8x8_t) -> poly8x8_t;
fn vrev64q_p8(a: poly8x16_t) -> poly8x16_t;
fn vrev64_p16(a: poly16x4_t) -> poly16x4_t;
fn vrev64q_p16(a: poly16x8_t) -> poly16x8_t;
fn vpadal_s8(a: int16x4_t, b: int8x8_t) -> int16x4_t;
fn vpadal_s16(a: int32x2_t, b: int16x4_t) -> int32x2_t;
fn vpadal_s32(a: int64x1_t, b: int32x2_t) -> int64x1_t;
fn vpadalq_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t;
fn vpadalq_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t;
fn vpadalq_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t;
fn vpadal_u8(a: uint16x4_t, b: uint8x8_t) -> uint16x4_t;
fn vpadal_u16(a: uint32x2_t, b: uint16x4_t) -> uint32x2_t;
fn vpadal_u32(a: uint64x1_t, b: uint32x2_t) -> uint64x1_t;
fn vpadalq_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t;
fn vpadalq_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t;
fn vpadalq_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t;
fn vcombine_f32(low: float32x2_t, high: float32x2_t) -> float32x4_t;
fn vcombine_p8(low: poly8x8_t, high: poly8x8_t) -> poly8x16_t;
fn vcombine_p16(low: poly16x4_t, high: poly16x4_t) -> poly16x8_t;
fn vcombine_s8(low: int8x8_t, high: int8x8_t) -> int8x16_t;
fn vcombine_s16(low: int16x4_t, high: int16x4_t) -> int16x8_t;
fn vcombine_s32(low: int32x2_t, high: int32x2_t) -> int32x4_t;
fn vcombine_s64(low: int64x1_t, high: int64x1_t) -> int64x2_t;
fn vcombine_u8(low: uint8x8_t, high: uint8x8_t) -> uint8x16_t;
fn vcombine_u16(low: uint16x4_t, high: uint16x4_t) -> uint16x8_t;
fn vcombine_u32(low: uint32x2_t, high: uint32x2_t) -> uint32x4_t;
fn vcombine_u64(low: uint64x1_t, high: uint64x1_t) -> uint64x2_t;
fn vcombine_p64(low: poly64x1_t, high: poly64x1_t) -> poly64x2_t;
fn vabd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vabdq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vabds_f32(a: f32, b: f32) -> f32;
fn vabdd_f64(a: f64, b: f64) -> f64;
fn vabdl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t;
fn vabdl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t;
fn vabdl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t;
fn vabdl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t;
fn vabdl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t;
fn vabdl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t;
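// Sketch of widening pairwise accumulation (vpadal, listed above), assuming
// the core::arch::aarch64 intrinsic of the same name:
//
//     use core::arch::aarch64::*;
//     // Folds 16 new bytes into 8 running u16 sums without u8 overflow.
//     unsafe fn accumulate(acc: uint16x8_t, bytes: uint8x16_t) -> uint16x8_t {
//         vpadalq_u8(acc, bytes)
//     }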
fn vceq_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
fn vceqq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vceq_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t;
fn vceqq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t;
fn vceq_p64(a: poly64x1_t, b: poly64x1_t) -> uint64x1_t;
fn vceqq_p64(a: poly64x2_t, b: poly64x2_t) -> uint64x2_t;
fn vceq_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
fn vceqq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
fn vceqd_s64(a: i64, b: i64) -> u64;
fn vceqd_u64(a: u64, b: u64) -> u64;
fn vceqs_f32(a: f32, b: f32) -> u32;
fn vceqd_f64(a: f64, b: f64) -> u64;
fn vceqz_s8(a: int8x8_t) -> uint8x8_t;
fn vceqzq_s8(a: int8x16_t) -> uint8x16_t;
fn vceqz_s16(a: int16x4_t) -> uint16x4_t;
fn vceqzq_s16(a: int16x8_t) -> uint16x8_t;
fn vceqz_s32(a: int32x2_t) -> uint32x2_t;
fn vceqzq_s32(a: int32x4_t) -> uint32x4_t;
fn vceqz_s64(a: int64x1_t) -> uint64x1_t;
fn vceqzq_s64(a: int64x2_t) -> uint64x2_t;
fn vceqz_p8(a: poly8x8_t) -> uint8x8_t;
fn vceqzq_p8(a: poly8x16_t) -> uint8x16_t;
fn vceqz_p64(a: poly64x1_t) -> uint64x1_t;
fn vceqzq_p64(a: poly64x2_t) -> uint64x2_t;
fn vceqz_u8(a: uint8x8_t) -> uint8x8_t;
fn vceqzq_u8(a: uint8x16_t) -> uint8x16_t;
fn vceqz_u16(a: uint16x4_t) -> uint16x4_t;
fn vceqzq_u16(a: uint16x8_t) -> uint16x8_t;
fn vceqz_u32(a: uint32x2_t) -> uint32x2_t;
fn vceqzq_u32(a: uint32x4_t) -> uint32x4_t;
fn vceqz_u64(a: uint64x1_t) -> uint64x1_t;
fn vceqzq_u64(a: uint64x2_t) -> uint64x2_t;
fn vceqz_f32(a: float32x2_t) -> uint32x2_t;
fn vceqzq_f32(a: float32x4_t) -> uint32x4_t;
fn vceqz_f64(a: float64x1_t) -> uint64x1_t;
fn vceqzq_f64(a: float64x2_t) -> uint64x2_t;
fn vceqzd_s64(a: i64) -> u64;
fn vceqzd_u64(a: u64) -> u64;
fn vceqzs_f32(a: f32) -> u32;
fn vceqzd_f64(a: f64) -> u64;
fn vtst_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t;
fn vtstq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t;
fn vtst_p64(a: poly64x1_t, b: poly64x1_t) -> uint64x1_t;
fn vtstq_p64(a: poly64x2_t, b: poly64x2_t) -> uint64x2_t;
fn vtst_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
fn vtstq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vtstd_s64(a: i64, b: i64) -> u64;
fn vtstd_u64(a: u64, b: u64) -> u64;
fn vuqadds_s32(a: i32, b: u32) -> i32;
fn vuqaddd_s64(a: i64, b: u64) -> i64;
fn vuqaddb_s8(a: i8, b: u8) -> i8;
fn vuqaddh_s16(a: i16, b: u16) -> i16;
fn vabs_f64(a: float64x1_t) -> float64x1_t;
fn vabsq_f64(a: float64x2_t) -> float64x2_t;
fn vcgt_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t;
fn vcgtq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t;
fn vcgt_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
fn vcgtq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vcgt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
fn vcgtq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
fn vcgtd_s64(a: i64, b: i64) -> u64;
fn vcgtd_u64(a: u64, b: u64) -> u64;
fn vcgts_f32(a: f32, b: f32) -> u32;
fn vcgtd_f64(a: f64, b: f64) -> u64;
fn vclt_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t;
fn vcltq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t;
fn vclt_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
fn vcltq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vclt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
fn vcltq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
fn vcltd_s64(a: i64, b: i64) -> u64;
fn vcltd_u64(a: u64, b: u64) -> u64;
fn vclts_f32(a: f32, b: f32) -> u32;
fn vcltd_f64(a: f64, b: f64) -> u64;
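// Sketch of the scalar compares above, assuming the core::arch::aarch64
// intrinsics: they return an all-ones/all-zeros mask, not a bool.
//
//     use core::arch::aarch64::*;
//     unsafe fn demo_masks() {
//         assert_eq!(vceqd_u64(7, 7), u64::MAX); // equal -> all ones
//         assert_eq!(vceqzs_f32(1.0), 0);        // nonzero -> all zeros
//     }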
fn vcle_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t;
fn vcleq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t;
fn vcged_s64(a: i64, b: i64) -> u64;
fn vcged_u64(a: u64, b: u64) -> u64;
fn vcges_f32(a: f32, b: f32) -> u32;
fn vcged_f64(a: f64, b: f64) -> u64;
fn vcle_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
fn vcleq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vcle_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
fn vcleq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
fn vcled_s64(a: i64, b: i64) -> u64;
fn vcled_u64(a: u64, b: u64) -> u64;
fn vcles_f32(a: f32, b: f32) -> u32;
fn vcled_f64(a: f64, b: f64) -> u64;
fn vcge_s64(a: int64x1_t, b: int64x1_t) -> uint64x1_t;
fn vcgeq_s64(a: int64x2_t, b: int64x2_t) -> uint64x2_t;
fn vcge_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
fn vcgeq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vcge_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
fn vcgeq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
fn vcgez_s8(a: int8x8_t) -> uint8x8_t;
fn vcgezq_s8(a: int8x16_t) -> uint8x16_t;
fn vcgez_s16(a: int16x4_t) -> uint16x4_t;
fn vcgezq_s16(a: int16x8_t) -> uint16x8_t;
fn vcgez_s32(a: int32x2_t) -> uint32x2_t;
fn vcgezq_s32(a: int32x4_t) -> uint32x4_t;
fn vcgez_s64(a: int64x1_t) -> uint64x1_t;
fn vcgezq_s64(a: int64x2_t) -> uint64x2_t;
fn vcgez_f32(a: float32x2_t) -> uint32x2_t;
fn vcgezq_f32(a: float32x4_t) -> uint32x4_t;
fn vcgez_f64(a: float64x1_t) -> uint64x1_t;
fn vcgezq_f64(a: float64x2_t) -> uint64x2_t;
fn vcgezd_s64(a: i64) -> u64;
fn vcgezs_f32(a: f32) -> u32;
fn vcgezd_f64(a: f64) -> u64;
fn vcgtz_s8(a: int8x8_t) -> uint8x8_t;
fn vcgtzq_s8(a: int8x16_t) -> uint8x16_t;
fn vcgtz_s16(a: int16x4_t) -> uint16x4_t;
fn vcgtzq_s16(a: int16x8_t) -> uint16x8_t;
fn vcgtz_s32(a: int32x2_t) -> uint32x2_t;
fn vcgtzq_s32(a: int32x4_t) -> uint32x4_t;
fn vcgtz_s64(a: int64x1_t) -> uint64x1_t;
fn vcgtzq_s64(a: int64x2_t) -> uint64x2_t;
fn vcgtz_f32(a: float32x2_t) -> uint32x2_t;
fn vcgtzq_f32(a: float32x4_t) -> uint32x4_t;
fn vcgtz_f64(a: float64x1_t) -> uint64x1_t;
fn vcgtzq_f64(a: float64x2_t) -> uint64x2_t;
fn vcgtzd_s64(a: i64) -> u64;
fn vcgtzs_f32(a: f32) -> u32;
fn vcgtzd_f64(a: f64) -> u64;
fn vclez_s8(a: int8x8_t) -> uint8x8_t;
fn vclezq_s8(a: int8x16_t) -> uint8x16_t;
fn vclez_s16(a: int16x4_t) -> uint16x4_t;
fn vclezq_s16(a: int16x8_t) -> uint16x8_t;
fn vclez_s32(a: int32x2_t) -> uint32x2_t;
fn vclezq_s32(a: int32x4_t) -> uint32x4_t;
fn vclez_s64(a: int64x1_t) -> uint64x1_t;
fn vclezq_s64(a: int64x2_t) -> uint64x2_t;
fn vclez_f32(a: float32x2_t) -> uint32x2_t;
fn vclezq_f32(a: float32x4_t) -> uint32x4_t;
fn vclez_f64(a: float64x1_t) -> uint64x1_t;
fn vclezq_f64(a: float64x2_t) -> uint64x2_t;
fn vclezd_s64(a: i64) -> u64;
fn vclezs_f32(a: f32) -> u32;
fn vclezd_f64(a: f64) -> u64;
fn vcltz_s8(a: int8x8_t) -> uint8x8_t;
fn vcltzq_s8(a: int8x16_t) -> uint8x16_t;
fn vcltz_s16(a: int16x4_t) -> uint16x4_t;
fn vcltzq_s16(a: int16x8_t) -> uint16x8_t;
fn vcltz_s32(a: int32x2_t) -> uint32x2_t;
fn vcltzq_s32(a: int32x4_t) -> uint32x4_t;
fn vcltz_s64(a: int64x1_t) -> uint64x1_t;
fn vcltzq_s64(a: int64x2_t) -> uint64x2_t;
fn vcltz_f32(a: float32x2_t) -> uint32x2_t;
fn vcltzq_f32(a: float32x4_t) -> uint32x4_t;
fn vcltz_f64(a: float64x1_t) -> uint64x1_t;
fn vcltzq_f64(a: float64x2_t) -> uint64x2_t;
fn vcltzd_s64(a: i64) -> u64;
fn vcltzs_f32(a: f32) -> u32;
fn vcltzd_f64(a: f64) -> u64;
fn vcagt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
fn vcagtq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
fn vcagts_f32(a: f32, b: f32) -> u32;
fn vcagtd_f64(a: f64, b: f64) -> u64;
fn vcage_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
fn vcageq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
fn vcages_f32(a: f32, b: f32) -> u32;
fn vcaged_f64(a: f64, b: f64) -> u64;
fn vcalt_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
fn vcaltq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
fn vcalts_f32(a: f32, b: f32) -> u32;
fn vcaltd_f64(a: f64, b: f64) -> u64;
fn vcale_f64(a: float64x1_t, b: float64x1_t) -> uint64x1_t;
fn vcaleq_f64(a: float64x2_t, b: float64x2_t) -> uint64x2_t;
fn vcales_f32(a: f32, b: f32) -> u32;
fn vcaled_f64(a: f64, b: f64) -> u64;
fn vcopy_lane_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
fn vcopyq_laneq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
fn vcopy_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
fn vcopyq_laneq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
fn vcopy_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
fn vcopyq_laneq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
fn vcopyq_laneq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
fn vcopy_lane_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
fn vcopyq_laneq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
fn vcopy_lane_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
fn vcopyq_laneq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
fn vcopy_lane_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
fn vcopyq_laneq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
fn vcopyq_laneq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
fn vcopy_lane_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t;
fn vcopyq_laneq_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t;
fn vcopy_lane_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t;
fn vcopyq_laneq_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t;
fn vcopyq_laneq_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t;
fn vcopy_lane_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vcopyq_laneq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vcopyq_laneq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vcopy_laneq_s8(a: int8x8_t, b: int8x16_t) -> int8x8_t;
fn vcopy_laneq_s16(a: int16x4_t, b: int16x8_t) -> int16x4_t;
fn vcopy_laneq_s32(a: int32x2_t, b: int32x4_t) -> int32x2_t;
fn vcopy_laneq_u8(a: uint8x8_t, b: uint8x16_t) -> uint8x8_t;
fn vcopy_laneq_u16(a: uint16x4_t, b: uint16x8_t) -> uint16x4_t;
fn vcopy_laneq_u32(a: uint32x2_t, b: uint32x4_t) -> uint32x2_t;
fn vcopy_laneq_p8(a: poly8x8_t, b: poly8x16_t) -> poly8x8_t;
fn vcopy_laneq_p16(a: poly16x4_t, b: poly16x8_t) -> poly16x4_t;
fn vcopy_laneq_f32(a: float32x2_t, b: float32x4_t) -> float32x2_t;
fn vcopyq_lane_s8(a: int8x16_t, b: int8x8_t) -> int8x16_t;
fn vcopyq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t;
fn vcopyq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t;
fn vcopyq_lane_u8(a: uint8x16_t, b: uint8x8_t) -> uint8x16_t;
fn vcopyq_lane_u16(a: uint16x8_t, b: uint16x4_t) -> uint16x8_t;
fn vcopyq_lane_u32(a: uint32x4_t, b: uint32x2_t) -> uint32x4_t;
fn vcopyq_lane_p8(a: poly8x16_t, b: poly8x8_t) -> poly8x16_t;
fn vcopyq_lane_p16(a: poly16x8_t, b: poly16x4_t) -> poly16x8_t;
fn vcopyq_lane_s64(a: int64x2_t, b: int64x1_t) -> int64x2_t;
fn vcopyq_lane_u64(a: uint64x2_t, b: uint64x1_t) -> uint64x2_t;
fn vcopyq_lane_p64(a: poly64x2_t, b: poly64x1_t) -> poly64x2_t;
fn vcopyq_lane_f32(a: float32x4_t, b: float32x2_t) -> float32x4_t;
fn vcopyq_lane_f64(a: float64x2_t, b: float64x1_t) -> float64x2_t;
fn vcreate_f64(a: u64) -> float64x1_t;
fn vcvt_f64_s64(a: int64x1_t) -> float64x1_t;
fn vcvtq_f64_s64(a: int64x2_t) -> float64x2_t;
fn vcvt_f64_u64(a: uint64x1_t) -> float64x1_t;
fn vcvtq_f64_u64(a: uint64x2_t) -> float64x2_t;
fn vcvt_f64_f32(a: float32x2_t) -> float64x2_t;
fn vcvt_high_f64_f32(a: float32x4_t) -> float64x2_t;
fn vcvt_f32_f64(a: float64x2_t) -> float32x2_t;
fn vcvt_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t;
fn vcvtx_f32_f64(a: float64x2_t) -> float32x2_t;
fn vcvtxd_f32_f64(a: f64) -> f32;
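// Sketch of the f32 <-> f64 conversions above (assumes core::arch::aarch64;
// vaddq_f64 is from the same module but not part of this listing):
//
//     use core::arch::aarch64::*;
//     // Widen two f32 lanes to f64, compute there, then narrow back.
//     unsafe fn via_f64(a: float32x2_t) -> float32x2_t {
//         let wide = vcvt_f64_f32(a);
//         vcvt_f32_f64(vaddq_f64(wide, wide))
//     }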
fn vcvtx_high_f32_f64(a: float32x2_t, b: float64x2_t) -> float32x4_t;
fn vcvt_n_f64_s64(a: int64x1_t) -> float64x1_t;
fn vcvtq_n_f64_s64(a: int64x2_t) -> float64x2_t;
fn vcvts_n_f32_s32(a: i32) -> f32;
fn vcvtd_n_f64_s64(a: i64) -> f64;
fn vcvt_n_f64_u64(a: uint64x1_t) -> float64x1_t;
fn vcvtq_n_f64_u64(a: uint64x2_t) -> float64x2_t;
fn vcvts_n_f32_u32(a: u32) -> f32;
fn vcvtd_n_f64_u64(a: u64) -> f64;
fn vcvt_n_s64_f64(a: float64x1_t) -> int64x1_t;
fn vcvtq_n_s64_f64(a: float64x2_t) -> int64x2_t;
fn vcvts_n_s32_f32(a: f32) -> i32;
fn vcvtd_n_s64_f64(a: f64) -> i64;
fn vcvt_n_u64_f64(a: float64x1_t) -> uint64x1_t;
fn vcvtq_n_u64_f64(a: float64x2_t) -> uint64x2_t;
fn vcvts_n_u32_f32(a: f32) -> u32;
fn vcvtd_n_u64_f64(a: f64) -> u64;
fn vcvts_f32_s32(a: i32) -> f32;
fn vcvtd_f64_s64(a: i64) -> f64;
fn vcvts_f32_u32(a: u32) -> f32;
fn vcvtd_f64_u64(a: u64) -> f64;
fn vcvts_s32_f32(a: f32) -> i32;
fn vcvtd_s64_f64(a: f64) -> i64;
fn vcvts_u32_f32(a: f32) -> u32;
fn vcvtd_u64_f64(a: f64) -> u64;
fn vcvt_s64_f64(a: float64x1_t) -> int64x1_t;
fn vcvtq_s64_f64(a: float64x2_t) -> int64x2_t;
fn vcvt_u64_f64(a: float64x1_t) -> uint64x1_t;
fn vcvtq_u64_f64(a: float64x2_t) -> uint64x2_t;
fn vcvta_s32_f32(a: float32x2_t) -> int32x2_t;
fn vcvtaq_s32_f32(a: float32x4_t) -> int32x4_t;
fn vcvta_s64_f64(a: float64x1_t) -> int64x1_t;
fn vcvtaq_s64_f64(a: float64x2_t) -> int64x2_t;
fn vcvtas_s32_f32(a: f32) -> i32;
fn vcvtad_s64_f64(a: f64) -> i64;
fn vcvtas_u32_f32(a: f32) -> u32;
fn vcvtad_u64_f64(a: f64) -> u64;
fn vcvtn_s32_f32(a: float32x2_t) -> int32x2_t;
fn vcvtnq_s32_f32(a: float32x4_t) -> int32x4_t;
fn vcvtn_s64_f64(a: float64x1_t) -> int64x1_t;
fn vcvtnq_s64_f64(a: float64x2_t) -> int64x2_t;
fn vcvtns_s32_f32(a: f32) -> i32;
fn vcvtnd_s64_f64(a: f64) -> i64;
fn vcvtm_s32_f32(a: float32x2_t) -> int32x2_t;
fn vcvtmq_s32_f32(a: float32x4_t) -> int32x4_t;
fn vcvtm_s64_f64(a: float64x1_t) -> int64x1_t;
fn vcvtmq_s64_f64(a: float64x2_t) -> int64x2_t;
fn vcvtms_s32_f32(a: f32) -> i32;
fn vcvtmd_s64_f64(a: f64) -> i64;
fn vcvtp_s32_f32(a: float32x2_t) -> int32x2_t;
fn vcvtpq_s32_f32(a: float32x4_t) -> int32x4_t;
fn vcvtp_s64_f64(a: float64x1_t) -> int64x1_t;
fn vcvtpq_s64_f64(a: float64x2_t) -> int64x2_t;
fn vcvtps_s32_f32(a: f32) -> i32;
fn vcvtpd_s64_f64(a: f64) -> i64;
fn vcvta_u32_f32(a: float32x2_t) -> uint32x2_t;
fn vcvtaq_u32_f32(a: float32x4_t) -> uint32x4_t;
fn vcvta_u64_f64(a: float64x1_t) -> uint64x1_t;
fn vcvtaq_u64_f64(a: float64x2_t) -> uint64x2_t;
fn vcvtn_u32_f32(a: float32x2_t) -> uint32x2_t;
fn vcvtnq_u32_f32(a: float32x4_t) -> uint32x4_t;
fn vcvtn_u64_f64(a: float64x1_t) -> uint64x1_t;
fn vcvtnq_u64_f64(a: float64x2_t) -> uint64x2_t;
fn vcvtns_u32_f32(a: f32) -> u32;
fn vcvtnd_u64_f64(a: f64) -> u64;
fn vcvtm_u32_f32(a: float32x2_t) -> uint32x2_t;
fn vcvtmq_u32_f32(a: float32x4_t) -> uint32x4_t;
fn vcvtm_u64_f64(a: float64x1_t) -> uint64x1_t;
fn vcvtmq_u64_f64(a: float64x2_t) -> uint64x2_t;
fn vcvtms_u32_f32(a: f32) -> u32;
fn vcvtmd_u64_f64(a: f64) -> u64;
fn vcvtp_u32_f32(a: float32x2_t) -> uint32x2_t;
fn vcvtpq_u32_f32(a: float32x4_t) -> uint32x4_t;
fn vcvtp_u64_f64(a: float64x1_t) -> uint64x1_t;
fn vcvtpq_u64_f64(a: float64x2_t) -> uint64x2_t;
fn vcvtps_u32_f32(a: f32) -> u32;
fn vcvtpd_u64_f64(a: f64) -> u64;
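// Sketch of the rounding-mode conversions above (a = ties away, n = nearest
// even, m = toward -inf, p = toward +inf); assumes core::arch::aarch64:
//
//     use core::arch::aarch64::*;
//     unsafe fn to_int(v: float32x4_t) -> (int32x4_t, int32x4_t) {
//         let nearest = vcvtaq_s32_f32(v); // round to nearest, ties away from zero
//         let floored = vcvtmq_s32_f32(v); // round toward minus infinity
//         (nearest, floored)
//     }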
fn vdupq_laneq_p64(a: poly64x2_t) -> poly64x2_t;
fn vdupq_lane_p64(a: poly64x1_t) -> poly64x2_t;
fn vdupq_laneq_f64(a: float64x2_t) -> float64x2_t;
fn vdupq_lane_f64(a: float64x1_t) -> float64x2_t;
fn vdup_lane_p64(a: poly64x1_t) -> poly64x1_t;
fn vdup_lane_f64(a: float64x1_t) -> float64x1_t;
fn vdup_laneq_p64(a: poly64x2_t) -> poly64x1_t;
fn vdup_laneq_f64(a: float64x2_t) -> float64x1_t;
fn vdupb_lane_s8(a: int8x8_t) -> i8;
fn vdupb_laneq_s8(a: int8x16_t) -> i8;
fn vduph_lane_s16(a: int16x4_t) -> i16;
fn vduph_laneq_s16(a: int16x8_t) -> i16;
fn vdups_lane_s32(a: int32x2_t) -> i32;
fn vdups_laneq_s32(a: int32x4_t) -> i32;
fn vdupd_lane_s64(a: int64x1_t) -> i64;
fn vdupd_laneq_s64(a: int64x2_t) -> i64;
fn vdupb_lane_u8(a: uint8x8_t) -> u8;
fn vdupb_laneq_u8(a: uint8x16_t) -> u8;
fn vduph_lane_u16(a: uint16x4_t) -> u16;
fn vduph_laneq_u16(a: uint16x8_t) -> u16;
fn vdups_lane_u32(a: uint32x2_t) -> u32;
fn vdups_laneq_u32(a: uint32x4_t) -> u32;
fn vdupd_lane_u64(a: uint64x1_t) -> u64;
fn vdupd_laneq_u64(a: uint64x2_t) -> u64;
fn vdupb_lane_p8(a: poly8x8_t) -> p8;
fn vdupb_laneq_p8(a: poly8x16_t) -> p8;
fn vduph_lane_p16(a: poly16x4_t) -> p16;
fn vduph_laneq_p16(a: poly16x8_t) -> p16;
fn vdups_lane_f32(a: float32x2_t) -> f32;
fn vdups_laneq_f32(a: float32x4_t) -> f32;
fn vdupd_lane_f64(a: float64x1_t) -> f64;
fn vdupd_laneq_f64(a: float64x2_t) -> f64;
fn vextq_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t;
fn vextq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vmla_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
fn vmlaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
fn vmlal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t;
fn vmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t;
fn vmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t;
fn vmlal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t;
fn vmlal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t;
fn vmlal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t;
fn vmlal_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t;
fn vmlal_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t;
fn vmlal_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t;
fn vmlal_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t;
fn vmlal_high_lane_s16(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t;
fn vmlal_high_laneq_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t;
fn vmlal_high_lane_s32(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t;
fn vmlal_high_laneq_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t;
fn vmlal_high_lane_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t;
fn vmlal_high_laneq_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t;
fn vmlal_high_lane_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t;
fn vmlal_high_laneq_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t;
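// Sketch of a widening multiply-accumulate from the vmlal_high family above
// (assumes the core::arch::aarch64 intrinsic of the same name):
//
//     use core::arch::aarch64::*;
//     // acc[i] += a[8 + i] * b[8 + i] for the high eight byte lanes, widened to u16.
//     unsafe fn mla_high(acc: uint16x8_t, a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
//         vmlal_high_u8(acc, a, b)
//     }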
fn vmls_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
fn vmlsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
fn vmlsl_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t;
fn vmlsl_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t;
fn vmlsl_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t;
fn vmlsl_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t;
fn vmlsl_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t;
fn vmlsl_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t;
fn vmlsl_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t;
fn vmlsl_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t;
fn vmlsl_high_n_u16(a: uint32x4_t, b: uint16x8_t, c: u16) -> uint32x4_t;
fn vmlsl_high_n_u32(a: uint64x2_t, b: uint32x4_t, c: u32) -> uint64x2_t;
fn vmlsl_high_lane_s16(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t;
fn vmlsl_high_laneq_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t;
fn vmlsl_high_lane_s32(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t;
fn vmlsl_high_laneq_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t;
fn vmlsl_high_lane_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x4_t) -> uint32x4_t;
fn vmlsl_high_laneq_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t;
fn vmlsl_high_lane_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x2_t) -> uint64x2_t;
fn vmlsl_high_laneq_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t;
fn vmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t;
fn vmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t;
fn vmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t;
fn vmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t;
fn vmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t;
fn vmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t;
fn vneg_s64(a: int64x1_t) -> int64x1_t;
fn vnegq_s64(a: int64x2_t) -> int64x2_t;
fn vnegd_s64(a: i64) -> i64;
fn vneg_f64(a: float64x1_t) -> float64x1_t;
fn vnegq_f64(a: float64x2_t) -> float64x2_t;
fn vqneg_s64(a: int64x1_t) -> int64x1_t;
fn vqnegq_s64(a: int64x2_t) -> int64x2_t;
fn vqnegb_s8(a: i8) -> i8;
fn vqnegh_s16(a: i16) -> i16;
fn vqnegs_s32(a: i32) -> i32;
fn vqnegd_s64(a: i64) -> i64;
fn vqsubb_s8(a: i8, b: i8) -> i8;
fn vqsubh_s16(a: i16, b: i16) -> i16;
fn vqsubb_u8(a: u8, b: u8) -> u8;
fn vqsubh_u16(a: u16, b: u16) -> u16;
fn vqsubs_u32(a: u32, b: u32) -> u32;
fn vqsubd_u64(a: u64, b: u64) -> u64;
fn vqsubs_s32(a: i32, b: i32) -> i32;
fn vqsubd_s64(a: i64, b: i64) -> i64;
fn vrbit_s8(a: int8x8_t) -> int8x8_t;
fn vrbitq_s8(a: int8x16_t) -> int8x16_t;
fn vrbit_u8(a: uint8x8_t) -> uint8x8_t;
fn vrbitq_u8(a: uint8x16_t) -> uint8x16_t;
fn vrbit_p8(a: poly8x8_t) -> poly8x8_t;
fn vrbitq_p8(a: poly8x16_t) -> poly8x16_t;
fn vrndx_f32(a: float32x2_t) -> float32x2_t;
fn vrndxq_f32(a: float32x4_t) -> float32x4_t;
fn vrndx_f64(a: float64x1_t) -> float64x1_t;
fn vrndxq_f64(a: float64x2_t) -> float64x2_t;
fn vrnda_f32(a: float32x2_t) -> float32x2_t;
fn vrndaq_f32(a: float32x4_t) -> float32x4_t;
fn vrnda_f64(a: float64x1_t) -> float64x1_t;
fn vrndaq_f64(a: float64x2_t) -> float64x2_t;
fn vrndn_f64(a: float64x1_t) -> float64x1_t;
fn vrndnq_f64(a: float64x2_t) -> float64x2_t;
fn vrndns_f32(a: f32) -> f32;
fn vrndm_f32(a: float32x2_t) -> float32x2_t;
fn vrndmq_f32(a: float32x4_t) -> float32x4_t;
fn vrndm_f64(a: float64x1_t) -> float64x1_t;
fn vrndmq_f64(a: float64x2_t) -> float64x2_t;
fn vrndp_f32(a: float32x2_t) -> float32x2_t;
fn vrndpq_f32(a: float32x4_t) -> float32x4_t;
fn vrndp_f64(a: float64x1_t) -> float64x1_t;
fn vrndpq_f64(a: float64x2_t) -> float64x2_t;
fn vrnd_f32(a: float32x2_t) -> float32x2_t;
fn vrndq_f32(a: float32x4_t) -> float32x4_t;
fn vrnd_f64(a: float64x1_t) -> float64x1_t;
fn vrndq_f64(a: float64x2_t) -> float64x2_t;
fn vrndi_f32(a: float32x2_t) -> float32x2_t;
fn vrndiq_f32(a: float32x4_t) -> float32x4_t;
fn vrndi_f64(a: float64x1_t) -> float64x1_t;
fn vrndiq_f64(a: float64x2_t) -> float64x2_t;
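// Sketch of the vrnd rounding family above (m = floor, p = ceil, a = round
// away from zero, plain vrnd = truncate); assumes core::arch::aarch64:
//
//     use core::arch::aarch64::*;
//     unsafe fn floors_and_ceils(v: float32x4_t) -> float32x4_t {
//         vaddq_f32(vrndmq_f32(v), vrndpq_f32(v)) // floor(v) + ceil(v), per lane
//     }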
fn vqaddb_s8(a: i8, b: i8) -> i8;
fn vqaddh_s16(a: i16, b: i16) -> i16;
fn vqaddb_u8(a: u8, b: u8) -> u8;
fn vqaddh_u16(a: u16, b: u16) -> u16;
fn vqadds_u32(a: u32, b: u32) -> u32;
fn vqaddd_u64(a: u64, b: u64) -> u64;
fn vqadds_s32(a: i32, b: i32) -> i32;
fn vqaddd_s64(a: i64, b: i64) -> i64;
unsafe fn vld1_f64_x2(a: *const f64) -> float64x1x2_t;
unsafe fn vld1q_f64_x2(a: *const f64) -> float64x2x2_t;
unsafe fn vld1_f64_x3(a: *const f64) -> float64x1x3_t;
unsafe fn vld1q_f64_x3(a: *const f64) -> float64x2x3_t;
unsafe fn vld1_f64_x4(a: *const f64) -> float64x1x4_t;
unsafe fn vld1q_f64_x4(a: *const f64) -> float64x2x4_t;
unsafe fn vld2q_s64(a: *const i64) -> int64x2x2_t;
unsafe fn vld2q_u64(a: *const u64) -> uint64x2x2_t;
unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t;
unsafe fn vld2q_f64(a: *const f64) -> float64x2x2_t;
unsafe fn vld2q_dup_s64(a: *const i64) -> int64x2x2_t;
unsafe fn vld2q_dup_u64(a: *const u64) -> uint64x2x2_t;
unsafe fn vld2_dup_f64(a: *const f64) -> float64x1x2_t;
unsafe fn vld2q_dup_f64(a: *const f64) -> float64x2x2_t;
unsafe fn vld2q_lane_s8(a: *const i8, b: int8x16x2_t) -> int8x16x2_t;
unsafe fn vld2_lane_s64(a: *const i64, b: int64x1x2_t) -> int64x1x2_t;
unsafe fn vld2q_lane_s64(a: *const i64, b: int64x2x2_t) -> int64x2x2_t;
unsafe fn vld2q_lane_u8(a: *const u8, b: uint8x16x2_t) -> uint8x16x2_t;
unsafe fn vld2_lane_u64(a: *const u64, b: uint64x1x2_t) -> uint64x1x2_t;
unsafe fn vld2q_lane_u64(a: *const u64, b: uint64x2x2_t) -> uint64x2x2_t;
unsafe fn vld2q_lane_p8(a: *const p8, b: poly8x16x2_t) -> poly8x16x2_t;
unsafe fn vld2_lane_f64(a: *const f64, b: float64x1x2_t) -> float64x1x2_t;
unsafe fn vld2q_lane_f64(a: *const f64, b: float64x2x2_t) -> float64x2x2_t;
unsafe fn vld3q_s64(a: *const i64) -> int64x2x3_t;
unsafe fn vld3q_u64(a: *const u64) -> uint64x2x3_t;
unsafe fn vld3_f64(a: *const f64) -> float64x1x3_t;
unsafe fn vld3q_f64(a: *const f64) -> float64x2x3_t;
unsafe fn vld3q_dup_s64(a: *const i64) -> int64x2x3_t;
unsafe fn vld3q_dup_u64(a: *const u64) -> uint64x2x3_t;
unsafe fn vld3_dup_f64(a: *const f64) -> float64x1x3_t;
unsafe fn vld3q_dup_f64(a: *const f64) -> float64x2x3_t;
unsafe fn vld3q_lane_s8(a: *const i8, b: int8x16x3_t) -> int8x16x3_t;
unsafe fn vld3_lane_s64(a: *const i64, b: int64x1x3_t) -> int64x1x3_t;
unsafe fn vld3q_lane_s64(a: *const i64, b: int64x2x3_t) -> int64x2x3_t;
unsafe fn vld3q_lane_p8(a: *const p8, b: poly8x16x3_t) -> poly8x16x3_t;
unsafe fn vld3q_lane_u8(a: *const u8, b: uint8x16x3_t) -> uint8x16x3_t;
unsafe fn vld3_lane_u64(a: *const u64, b: uint64x1x3_t) -> uint64x1x3_t;
unsafe fn vld3q_lane_u64(a: *const u64, b: uint64x2x3_t) -> uint64x2x3_t;
unsafe fn vld3_lane_f64(a: *const f64, b: float64x1x3_t) -> float64x1x3_t;
unsafe fn vld3q_lane_f64(a: *const f64, b: float64x2x3_t) -> float64x2x3_t;
unsafe fn vld4q_s64(a: *const i64) -> int64x2x4_t;
unsafe fn vld4q_u64(a: *const u64) -> uint64x2x4_t;
unsafe fn vld4_f64(a: *const f64) -> float64x1x4_t;
unsafe fn vld4q_f64(a: *const f64) -> float64x2x4_t;
unsafe fn vld4q_dup_s64(a: *const i64) -> int64x2x4_t;
unsafe fn vld4q_dup_u64(a: *const u64) -> uint64x2x4_t;
unsafe fn vld4_dup_f64(a: *const f64) -> float64x1x4_t;
unsafe fn vld4q_dup_f64(a: *const f64) -> float64x2x4_t;
unsafe fn vld4q_lane_s8(a: *const i8, b: int8x16x4_t) -> int8x16x4_t;
unsafe fn vld4_lane_s64(a: *const i64, b: int64x1x4_t) -> int64x1x4_t;
unsafe fn vld4q_lane_s64(a: *const i64, b: int64x2x4_t) -> int64x2x4_t;
unsafe fn vld4q_lane_p8(a: *const p8, b: poly8x16x4_t) -> poly8x16x4_t;
unsafe fn vld4q_lane_u8(a: *const u8, b: uint8x16x4_t) -> uint8x16x4_t;
unsafe fn vld4_lane_u64(a: *const u64, b: uint64x1x4_t) -> uint64x1x4_t;
unsafe fn vld4q_lane_u64(a: *const u64, b: uint64x2x4_t) -> uint64x2x4_t;
unsafe fn vld4_lane_f64(a: *const f64, b: float64x1x4_t) -> float64x1x4_t;
unsafe fn vld4q_lane_f64(a: *const f64, b: float64x2x4_t) -> float64x2x4_t;
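// Sketch of a structured (deinterleaving) load from the vld2 family above,
// assuming core::arch::aarch64; the pointer must cover four f64 values:
//
//     use core::arch::aarch64::*;
//     // For [re0, im0, re1, im1]: pairs.0 = [re0, re1], pairs.1 = [im0, im1].
//     unsafe fn split_complex(p: *const f64) -> float64x2x2_t {
//         vld2q_f64(p)
//     }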
unsafe fn vst1_lane_f64(a: *mut f64, b: float64x1_t);
unsafe fn vst1q_lane_f64(a: *mut f64, b: float64x2_t);
unsafe fn vst1_f64_x2(a: *mut f64, b: float64x1x2_t);
unsafe fn vst1q_f64_x2(a: *mut f64, b: float64x2x2_t);
unsafe fn vst1_f64_x3(a: *mut f64, b: float64x1x3_t);
unsafe fn vst1q_f64_x3(a: *mut f64, b: float64x2x3_t);
unsafe fn vst1_f64_x4(a: *mut f64, b: float64x1x4_t);
unsafe fn vst1q_f64_x4(a: *mut f64, b: float64x2x4_t);
unsafe fn vst2q_s64(a: *mut i64, b: int64x2x2_t);
unsafe fn vst2q_u64(a: *mut u64, b: uint64x2x2_t);
unsafe fn vst2_f64(a: *mut f64, b: float64x1x2_t);
unsafe fn vst2q_f64(a: *mut f64, b: float64x2x2_t);
unsafe fn vst2q_lane_s8(a: *mut i8, b: int8x16x2_t);
unsafe fn vst2_lane_s64(a: *mut i64, b: int64x1x2_t);
unsafe fn vst2q_lane_s64(a: *mut i64, b: int64x2x2_t);
unsafe fn vst2q_lane_u8(a: *mut u8, b: uint8x16x2_t);
unsafe fn vst2_lane_u64(a: *mut u64, b: uint64x1x2_t);
unsafe fn vst2q_lane_u64(a: *mut u64, b: uint64x2x2_t);
unsafe fn vst2q_lane_p8(a: *mut p8, b: poly8x16x2_t);
unsafe fn vst2_lane_f64(a: *mut f64, b: float64x1x2_t);
unsafe fn vst2q_lane_f64(a: *mut f64, b: float64x2x2_t);
unsafe fn vst3q_s64(a: *mut i64, b: int64x2x3_t);
unsafe fn vst3q_u64(a: *mut u64, b: uint64x2x3_t);
unsafe fn vst3_f64(a: *mut f64, b: float64x1x3_t);
unsafe fn vst3q_f64(a: *mut f64, b: float64x2x3_t);
unsafe fn vst3q_lane_s8(a: *mut i8, b: int8x16x3_t);
unsafe fn vst3_lane_s64(a: *mut i64, b: int64x1x3_t);
unsafe fn vst3q_lane_s64(a: *mut i64, b: int64x2x3_t);
unsafe fn vst3q_lane_u8(a: *mut u8, b: uint8x16x3_t);
unsafe fn vst3_lane_u64(a: *mut u64, b: uint64x1x3_t);
unsafe fn vst3q_lane_u64(a: *mut u64, b: uint64x2x3_t);
unsafe fn vst3q_lane_p8(a: *mut p8, b: poly8x16x3_t);
unsafe fn vst3_lane_f64(a: *mut f64, b: float64x1x3_t);
unsafe fn vst3q_lane_f64(a: *mut f64, b: float64x2x3_t);
unsafe fn vst4q_s64(a: *mut i64, b: int64x2x4_t);
unsafe fn vst4q_u64(a: *mut u64, b: uint64x2x4_t);
unsafe fn vst4_f64(a: *mut f64, b: float64x1x4_t);
unsafe fn vst4q_f64(a: *mut f64, b: float64x2x4_t);
unsafe fn vst4q_lane_s8(a: *mut i8, b: int8x16x4_t);
unsafe fn vst4_lane_s64(a: *mut i64, b: int64x1x4_t);
unsafe fn vst4q_lane_s64(a: *mut i64, b: int64x2x4_t);
unsafe fn vst4q_lane_u8(a: *mut u8, b: uint8x16x4_t);
unsafe fn vst4_lane_u64(a: *mut u64, b: uint64x1x4_t);
unsafe fn vst4q_lane_u64(a: *mut u64, b: uint64x2x4_t);
unsafe fn vst4q_lane_p8(a: *mut p8, b: poly8x16x4_t);
unsafe fn vst4_lane_f64(a: *mut f64, b: float64x1x4_t);
unsafe fn vst4q_lane_f64(a: *mut f64, b: float64x2x4_t);
fn vmul_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vmulq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vmul_n_f64(a: float64x1_t, b: f64) -> float64x1_t;
fn vmulq_n_f64(a: float64x2_t, b: f64) -> float64x2_t;
fn vmul_lane_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vmul_laneq_f64(a: float64x1_t, b: float64x2_t) -> float64x1_t;
fn vmulq_lane_f64(a: float64x2_t, b: float64x1_t) -> float64x2_t;
fn vmulq_laneq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vmuls_lane_f32(a: f32, b: float32x2_t) -> f32;
fn vmuls_laneq_f32(a: f32, b: float32x4_t) -> f32;
fn vmuld_lane_f64(a: f64, b: float64x1_t) -> f64;
fn vmuld_laneq_f64(a: f64, b: float64x2_t) -> f64;
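// Sketch of the scalar-by-vector multiply above (assumes core::arch::aarch64):
//
//     use core::arch::aarch64::*;
//     unsafe fn scale(v: float64x2_t, s: f64) -> float64x2_t {
//         vmulq_n_f64(v, s) // multiply every lane by the scalar s
//     }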
fn vmull_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t;
fn vmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t;
fn vmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t;
fn vmull_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t;
fn vmull_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t;
fn vmull_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t;
fn vmull_high_p8(a: poly8x16_t, b: poly8x16_t) -> poly16x8_t;
fn vmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t;
fn vmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t;
fn vmull_high_n_u16(a: uint16x8_t, b: u16) -> uint32x4_t;
fn vmull_high_n_u32(a: uint32x4_t, b: u32) -> uint64x2_t;
fn vmull_high_lane_s16(a: int16x8_t, b: int16x4_t) -> int32x4_t;
fn vmull_high_laneq_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t;
fn vmull_high_lane_s32(a: int32x4_t, b: int32x2_t) -> int64x2_t;
fn vmull_high_laneq_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t;
fn vmull_high_lane_u16(a: uint16x8_t, b: uint16x4_t) -> uint32x4_t;
fn vmull_high_laneq_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t;
fn vmull_high_lane_u32(a: uint32x4_t, b: uint32x2_t) -> uint64x2_t;
fn vmull_high_laneq_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t;
fn vmulx_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vmulxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vmulx_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vmulxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vmulx_lane_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vmulx_laneq_f64(a: float64x1_t, b: float64x2_t) -> float64x1_t;
fn vmulx_lane_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vmulx_laneq_f32(a: float32x2_t, b: float32x4_t) -> float32x2_t;
fn vmulxq_lane_f32(a: float32x4_t, b: float32x2_t) -> float32x4_t;
fn vmulxq_laneq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vmulxq_lane_f64(a: float64x2_t, b: float64x1_t) -> float64x2_t;
fn vmulxq_laneq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vmulxs_f32(a: f32, b: f32) -> f32;
fn vmulxd_f64(a: f64, b: f64) -> f64;
fn vmulxs_lane_f32(a: f32, b: float32x2_t) -> f32;
fn vmulxs_laneq_f32(a: f32, b: float32x4_t) -> f32;
fn vmulxd_lane_f64(a: f64, b: float64x1_t) -> f64;
fn vmulxd_laneq_f64(a: f64, b: float64x2_t) -> f64;
fn vfma_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
fn vfmaq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
fn vfma_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t;
fn vfmaq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t;
fn vfma_lane_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
fn vfma_laneq_f32(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t;
fn vfmaq_lane_f32(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t;
fn vfmaq_laneq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
fn vfma_lane_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
fn vfma_laneq_f64(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t;
fn vfmaq_lane_f64(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t;
fn vfmaq_laneq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
fn vfmas_lane_f32(a: f32, b: f32, c: float32x2_t) -> f32;
fn vfmas_laneq_f32(a: f32, b: f32, c: float32x4_t) -> f32;
fn vfmad_lane_f64(a: f64, b: f64, c: float64x1_t) -> f64;
fn vfmad_laneq_f64(a: f64, b: f64, c: float64x2_t) -> f64;
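// Sketch of fused multiply-add from the vfma family above: the result is
// a + b * c with a single rounding step (assumes core::arch::aarch64):
//
//     use core::arch::aarch64::*;
//     unsafe fn fma2(acc: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t {
//         vfmaq_f64(acc, b, c) // acc + b * c, per lane, fused
//     }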
fn vfms_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
fn vfmsq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
fn vfms_n_f64(a: float64x1_t, b: float64x1_t, c: f64) -> float64x1_t;
fn vfmsq_n_f64(a: float64x2_t, b: float64x2_t, c: f64) -> float64x2_t;
fn vfms_lane_f32(a: float32x2_t, b: float32x2_t, c: float32x2_t) -> float32x2_t;
fn vfms_laneq_f32(a: float32x2_t, b: float32x2_t, c: float32x4_t) -> float32x2_t;
fn vfmsq_lane_f32(a: float32x4_t, b: float32x4_t, c: float32x2_t) -> float32x4_t;
fn vfmsq_laneq_f32(a: float32x4_t, b: float32x4_t, c: float32x4_t) -> float32x4_t;
fn vfms_lane_f64(a: float64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t;
fn vfms_laneq_f64(a: float64x1_t, b: float64x1_t, c: float64x2_t) -> float64x1_t;
fn vfmsq_lane_f64(a: float64x2_t, b: float64x2_t, c: float64x1_t) -> float64x2_t;
fn vfmsq_laneq_f64(a: float64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t;
fn vfmss_lane_f32(a: f32, b: f32, c: float32x2_t) -> f32;
fn vfmss_laneq_f32(a: f32, b: f32, c: float32x4_t) -> f32;
fn vfmsd_lane_f64(a: f64, b: f64, c: float64x1_t) -> f64;
fn vfmsd_laneq_f64(a: f64, b: f64, c: float64x2_t) -> f64;
fn vdiv_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vdivq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vdiv_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vdivq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vsub_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vsubq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vsubd_s64(a: i64, b: i64) -> i64;
fn vsubd_u64(a: u64, b: u64) -> u64;
fn vaddv_f32(a: float32x2_t) -> f32;
fn vaddvq_f32(a: float32x4_t) -> f32;
fn vaddvq_f64(a: float64x2_t) -> f64;
fn vaddlv_s16(a: int16x4_t) -> i32;
fn vaddlvq_s16(a: int16x8_t) -> i32;
fn vaddlv_s32(a: int32x2_t) -> i64;
fn vaddlvq_s32(a: int32x4_t) -> i64;
fn vaddlv_u16(a: uint16x4_t) -> u32;
fn vaddlvq_u16(a: uint16x8_t) -> u32;
fn vaddlv_u32(a: uint32x2_t) -> u64;
fn vaddlvq_u32(a: uint32x4_t) -> u64;
fn vsubw_high_s8(a: int16x8_t, b: int8x16_t) -> int16x8_t;
fn vsubw_high_s16(a: int32x4_t, b: int16x8_t) -> int32x4_t;
fn vsubw_high_s32(a: int64x2_t, b: int32x4_t) -> int64x2_t;
fn vsubw_high_u8(a: uint16x8_t, b: uint8x16_t) -> uint16x8_t;
fn vsubw_high_u16(a: uint32x4_t, b: uint16x8_t) -> uint32x4_t;
fn vsubw_high_u32(a: uint64x2_t, b: uint32x4_t) -> uint64x2_t;
fn vsubl_high_s8(a: int8x16_t, b: int8x16_t) -> int16x8_t;
fn vsubl_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t;
fn vsubl_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t;
fn vsubl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t;
fn vsubl_high_u16(a: uint16x8_t, b: uint16x8_t) -> uint32x4_t;
fn vsubl_high_u32(a: uint32x4_t, b: uint32x4_t) -> uint64x2_t;
fn vmax_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vmaxnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vmaxnmv_f32(a: float32x2_t) -> f32;
fn vmaxnmvq_f64(a: float64x2_t) -> f64;
fn vmaxnmvq_f32(a: float32x4_t) -> f32;
fn vpmaxnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vpmaxnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vpmaxnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vpmaxnms_f32(a: float32x2_t) -> f32;
fn vpmaxnmqd_f64(a: float64x2_t) -> f64;
fn vpmaxs_f32(a: float32x2_t) -> f32;
fn vpmaxqd_f64(a: float64x2_t) -> f64;
fn vmin_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vminnm_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t;
fn vminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vminnmv_f32(a: float32x2_t) -> f32;
fn vminnmvq_f64(a: float64x2_t) -> f64;
fn vminnmvq_f32(a: float32x4_t) -> f32;
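// Sketch of the horizontal reductions above (assumes core::arch::aarch64;
// the "nm" variants follow IEEE maxNum semantics and ignore NaN inputs):
//
//     use core::arch::aarch64::*;
//     unsafe fn reduce(v: float32x4_t) -> (f32, f32) {
//         (vaddvq_f32(v), vmaxnmvq_f32(v)) // across-lane sum and NaN-ignoring max
//     }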
fn vmovl_high_s8(a: int8x16_t) -> int16x8_t;
fn vmovl_high_s16(a: int16x8_t) -> int32x4_t;
fn vmovl_high_s32(a: int32x4_t) -> int64x2_t;
fn vmovl_high_u8(a: uint8x16_t) -> uint16x8_t;
fn vmovl_high_u16(a: uint16x8_t) -> uint32x4_t;
fn vmovl_high_u32(a: uint32x4_t) -> uint64x2_t;
fn vpaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vpaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vpadds_f32(a: float32x2_t) -> f32;
fn vpaddd_f64(a: float64x2_t) -> f64;
fn vpminnm_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t;
fn vpminnmq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t;
fn vpminnmq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
fn vpminnms_f32(a: float32x2_t) -> f32;
fn vpminnmqd_f64(a: float64x2_t) -> f64;
fn vpmins_f32(a: float32x2_t) -> f32;
fn vpminqd_f64(a: float64x2_t) -> f64;
fn vqdmullh_s16(a: i16, b: i16) -> i32;
fn vqdmulls_s32(a: i32, b: i32) -> i64;
fn vqdmull_high_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t;
fn vqdmull_high_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t;
fn vqdmull_high_n_s16(a: int16x8_t, b: i16) -> int32x4_t;
fn vqdmull_high_n_s32(a: int32x4_t, b: i32) -> int64x2_t;
fn vqdmull_laneq_s16(a: int16x4_t, b: int16x8_t) -> int32x4_t;
fn vqdmull_laneq_s32(a: int32x2_t, b: int32x4_t) -> int64x2_t;
fn vqdmullh_lane_s16(a: i16, b: int16x4_t) -> i32;
fn vqdmullh_laneq_s16(a: i16, b: int16x8_t) -> i32;
fn vqdmulls_lane_s32(a: i32, b: int32x2_t) -> i64;
fn vqdmulls_laneq_s32(a: i32, b: int32x4_t) -> i64;
fn vqdmull_high_lane_s16(a: int16x8_t, b: int16x4_t) -> int32x4_t;
fn vqdmull_high_lane_s32(a: int32x4_t, b: int32x2_t) -> int64x2_t;
fn vqdmull_high_laneq_s16(a: int16x8_t, b: int16x8_t) -> int32x4_t;
fn vqdmull_high_laneq_s32(a: int32x4_t, b: int32x4_t) -> int64x2_t;
fn vqdmlal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t;
fn vqdmlal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t;
fn vqdmlal_high_n_s16(a: int32x4_t, b: int16x8_t, c: i16) -> int32x4_t;
fn vqdmlal_high_n_s32(a: int64x2_t, b: int32x4_t, c: i32) -> int64x2_t;
fn vqdmlal_laneq_s16(a: int32x4_t, b: int16x4_t, c: int16x8_t) -> int32x4_t;
fn vqdmlal_laneq_s32(a: int64x2_t, b: int32x2_t, c: int32x4_t) -> int64x2_t;
fn vqdmlal_high_lane_s16(a: int32x4_t, b: int16x8_t, c: int16x4_t) -> int32x4_t;
fn vqdmlal_high_laneq_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t;
fn vqdmlal_high_lane_s32(a: int64x2_t, b: int32x4_t, c: int32x2_t) -> int64x2_t;
fn vqdmlal_high_laneq_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t;
fn vqdmlalh_s16(a: i32, b: i16, c: i16) -> i32;
fn vqdmlals_s32(a: i64, b: i32, c: i32) -> i64;
fn vqdmlalh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32;
fn vqdmlalh_laneq_s16(a: i32, b: i16, c: int16x8_t) -> i32;
fn vqdmlals_lane_s32(a: i64, b: i32, c: int32x2_t) -> i64;
fn vqdmlals_laneq_s32(a: i64, b: i32, c: int32x4_t) -> i64;
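// Sketch of saturating doubling long multiply from the vqdmull family above,
// the usual Q15 fixed-point primitive (assumes core::arch::aarch64):
//
//     use core::arch::aarch64::*;
//     // Doubling the Q30 product restores the Q31 binary point; it saturates
//     // only for i16::MIN * i16::MIN (i.e. -1.0 * -1.0 in Q15).
//     unsafe fn q15_mul_high(a: int16x8_t, b: int16x8_t) -> int32x4_t {
//         vqdmull_high_s16(a, b)
//     }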
b: int32x4_t, c: int32x2_t, ) -> int64x2_t; fn vqdmlsl_high_laneq_s32( a: int64x2_t, b: int32x4_t, c: int32x4_t, ) -> int64x2_t; fn vqdmlslh_s16(a: i32, b: i16, c: i16) -> i32; fn vqdmlsls_s32(a: i64, b: i32, c: i32) -> i64; fn vqdmlslh_lane_s16(a: i32, b: i16, c: int16x4_t) -> i32; fn vqdmlslh_laneq_s16(a: i32, b: i16, c: int16x8_t) -> i32; fn vqdmlsls_lane_s32(a: i64, b: i32, c: int32x2_t) -> i64; fn vqdmlsls_laneq_s32(a: i64, b: i32, c: int32x4_t) -> i64; fn vqdmulhh_s16(a: i16, b: i16) -> i16; fn vqdmulhs_s32(a: i32, b: i32) -> i32; fn vqdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16; fn vqdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16; fn vqdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32; fn vqdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32; fn vqdmulh_lane_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vqdmulhq_lane_s16(a: int16x8_t, b: int16x4_t) -> int16x8_t; fn vqdmulh_lane_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vqdmulhq_lane_s32(a: int32x4_t, b: int32x2_t) -> int32x4_t; fn vqmovnh_s16(a: i16) -> i8; fn vqmovns_s32(a: i32) -> i16; fn vqmovnh_u16(a: u16) -> u8; fn vqmovns_u32(a: u32) -> u16; fn vqmovnd_s64(a: i64) -> i32; fn vqmovnd_u64(a: u64) -> u32; fn vqmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t; fn vqmovn_high_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t; fn vqmovn_high_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t; fn vqmovn_high_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t; fn vqmovn_high_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t; fn vqmovn_high_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t; fn vqmovunh_s16(a: i16) -> u8; fn vqmovuns_s32(a: i32) -> u16; fn vqmovund_s64(a: i64) -> u32; fn vqmovun_high_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t; fn vqmovun_high_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t; fn vqmovun_high_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t; fn vqrdmulhh_s16(a: i16, b: i16) -> i16; fn vqrdmulhs_s32(a: i32, b: i32) -> i32; fn vqrdmulhh_lane_s16(a: i16, b: int16x4_t) -> i16; fn vqrdmulhh_laneq_s16(a: i16, b: int16x8_t) -> i16; fn vqrdmulhs_lane_s32(a: i32, b: int32x2_t) -> i32; fn vqrdmulhs_laneq_s32(a: i32, b: int32x4_t) -> i32; fn vqrdmlah_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t; fn vqrdmlahq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> int16x8_t; fn vqrdmlah_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t; fn vqrdmlahq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; fn vqrdmlahh_s16(a: i16, b: i16, c: i16) -> i16; fn vqrdmlahs_s32(a: i32, b: i32, c: i32) -> i32; fn vqrdmlah_lane_s16( a: int16x4_t, b: int16x4_t, c: int16x4_t, ) -> int16x4_t; fn vqrdmlah_laneq_s16( a: int16x4_t, b: int16x4_t, c: int16x8_t, ) -> int16x4_t; fn vqrdmlahq_lane_s16( a: int16x8_t, b: int16x8_t, c: int16x4_t, ) -> int16x8_t; fn vqrdmlahq_laneq_s16( a: int16x8_t, b: int16x8_t, c: int16x8_t, ) -> int16x8_t; fn vqrdmlah_lane_s32( a: int32x2_t, b: int32x2_t, c: int32x2_t, ) -> int32x2_t; fn vqrdmlah_laneq_s32( a: int32x2_t, b: int32x2_t, c: int32x4_t, ) -> int32x2_t; fn vqrdmlahq_lane_s32( a: int32x4_t, b: int32x4_t, c: int32x2_t, ) -> int32x4_t; fn vqrdmlahq_laneq_s32( a: int32x4_t, b: int32x4_t, c: int32x4_t, ) -> int32x4_t; fn vqrdmlahh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16; fn vqrdmlahh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16; fn vqrdmlahs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32; fn vqrdmlahs_laneq_s32(a: i32, b: i32, c: int32x4_t) -> i32; fn vqrdmlsh_s16(a: int16x4_t, b: int16x4_t, c: int16x4_t) -> int16x4_t; fn vqrdmlshq_s16(a: int16x8_t, b: int16x8_t, c: int16x8_t) -> 
int16x8_t; fn vqrdmlsh_s32(a: int32x2_t, b: int32x2_t, c: int32x2_t) -> int32x2_t; fn vqrdmlshq_s32(a: int32x4_t, b: int32x4_t, c: int32x4_t) -> int32x4_t; fn vqrdmlshh_s16(a: i16, b: i16, c: i16) -> i16; fn vqrdmlshs_s32(a: i32, b: i32, c: i32) -> i32; fn vqrdmlsh_lane_s16( a: int16x4_t, b: int16x4_t, c: int16x4_t, ) -> int16x4_t; fn vqrdmlsh_laneq_s16( a: int16x4_t, b: int16x4_t, c: int16x8_t, ) -> int16x4_t; fn vqrdmlshq_lane_s16( a: int16x8_t, b: int16x8_t, c: int16x4_t, ) -> int16x8_t; fn vqrdmlshq_laneq_s16( a: int16x8_t, b: int16x8_t, c: int16x8_t, ) -> int16x8_t; fn vqrdmlsh_lane_s32( a: int32x2_t, b: int32x2_t, c: int32x2_t, ) -> int32x2_t; fn vqrdmlsh_laneq_s32( a: int32x2_t, b: int32x2_t, c: int32x4_t, ) -> int32x2_t; fn vqrdmlshq_lane_s32( a: int32x4_t, b: int32x4_t, c: int32x2_t, ) -> int32x4_t; fn vqrdmlshq_laneq_s32( a: int32x4_t, b: int32x4_t, c: int32x4_t, ) -> int32x4_t; fn vqrdmlshh_lane_s16(a: i16, b: i16, c: int16x4_t) -> i16; fn vqrdmlshh_laneq_s16(a: i16, b: i16, c: int16x8_t) -> i16; fn vqrdmlshs_lane_s32(a: i32, b: i32, c: int32x2_t) -> i32; fn vqrdmlshs_laneq_s32(a: i32, b: i32, c: int32x4_t) -> i32; fn vqrshls_s32(a: i32, b: i32) -> i32; fn vqrshld_s64(a: i64, b: i64) -> i64; fn vqrshlb_s8(a: i8, b: i8) -> i8; fn vqrshlh_s16(a: i16, b: i16) -> i16; fn vqrshls_u32(a: u32, b: i32) -> u32; fn vqrshld_u64(a: u64, b: i64) -> u64; fn vqrshlb_u8(a: u8, b: i8) -> u8; fn vqrshlh_u16(a: u16, b: i16) -> u16; fn vqrshrnh_n_s16(a: i16) -> i8; fn vqrshrns_n_s32(a: i32) -> i16; fn vqrshrnd_n_s64(a: i64) -> i32; fn vqrshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t; fn vqrshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t; fn vqrshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t; fn vqrshrnh_n_u16(a: u16) -> u8; fn vqrshrns_n_u32(a: u32) -> u16; fn vqrshrnd_n_u64(a: u64) -> u32; fn vqrshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t; fn vqrshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t; fn vqrshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t; fn vqrshrunh_n_s16(a: i16) -> u8; fn vqrshruns_n_s32(a: i32) -> u16; fn vqrshrund_n_s64(a: i64) -> u32; fn vqrshrun_high_n_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t; fn vqrshrun_high_n_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t; fn vqrshrun_high_n_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t; fn vqshld_s64(a: i64, b: i64) -> i64; fn vqshlb_s8(a: i8, b: i8) -> i8; fn vqshlh_s16(a: i16, b: i16) -> i16; fn vqshls_s32(a: i32, b: i32) -> i32; fn vqshld_u64(a: u64, b: i64) -> u64; fn vqshlb_u8(a: u8, b: i8) -> u8; fn vqshlh_u16(a: u16, b: i16) -> u16; fn vqshls_u32(a: u32, b: i32) -> u32; fn vqshlb_n_s8(a: i8) -> i8; fn vqshlh_n_s16(a: i16) -> i16; fn vqshls_n_s32(a: i32) -> i32; fn vqshld_n_s64(a: i64) -> i64; fn vqshlb_n_u8(a: u8) -> u8; fn vqshlh_n_u16(a: u16) -> u16; fn vqshls_n_u32(a: u32) -> u32; fn vqshld_n_u64(a: u64) -> u64; fn vqshlub_n_s8(a: i8) -> u8; fn vqshluh_n_s16(a: i16) -> u16; fn vqshlus_n_s32(a: i32) -> u32; fn vqshlud_n_s64(a: i64) -> u64; fn vqshrnd_n_s64(a: i64) -> i32; fn vqshrnh_n_s16(a: i16) -> i8; fn vqshrns_n_s32(a: i32) -> i16; fn vqshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t; fn vqshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t; fn vqshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t; fn vqshrnd_n_u64(a: u64) -> u32; fn vqshrnh_n_u16(a: u16) -> u8; fn vqshrns_n_u32(a: u32) -> u16; fn vqshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t; fn vqshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t; fn 
vqshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t; fn vqshrunh_n_s16(a: i16) -> u8; fn vqshruns_n_s32(a: i32) -> u16; fn vqshrund_n_s64(a: i64) -> u32; fn vqshrun_high_n_s16(a: uint8x8_t, b: int16x8_t) -> uint8x16_t; fn vqshrun_high_n_s32(a: uint16x4_t, b: int32x4_t) -> uint16x8_t; fn vqshrun_high_n_s64(a: uint32x2_t, b: int64x2_t) -> uint32x4_t; fn vsqaddb_u8(a: u8, b: i8) -> u8; fn vsqaddh_u16(a: u16, b: i16) -> u16; fn vsqadds_u32(a: u32, b: i32) -> u32; fn vsqaddd_u64(a: u64, b: i64) -> u64; fn vsqrt_f32(a: float32x2_t) -> float32x2_t; fn vsqrtq_f32(a: float32x4_t) -> float32x4_t; fn vsqrt_f64(a: float64x1_t) -> float64x1_t; fn vsqrtq_f64(a: float64x2_t) -> float64x2_t; fn vrsqrte_f64(a: float64x1_t) -> float64x1_t; fn vrsqrteq_f64(a: float64x2_t) -> float64x2_t; fn vrsqrtes_f32(a: f32) -> f32; fn vrsqrted_f64(a: f64) -> f64; fn vrsqrts_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; fn vrsqrtsq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vrsqrtss_f32(a: f32, b: f32) -> f32; fn vrsqrtsd_f64(a: f64, b: f64) -> f64; fn vrecpe_f64(a: float64x1_t) -> float64x1_t; fn vrecpeq_f64(a: float64x2_t) -> float64x2_t; fn vrecpes_f32(a: f32) -> f32; fn vrecped_f64(a: f64) -> f64; fn vrecps_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; fn vrecpsq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vrecpss_f32(a: f32, b: f32) -> f32; fn vrecpsd_f64(a: f64, b: f64) -> f64; fn vrecpxs_f32(a: f32) -> f32; fn vrecpxd_f64(a: f64) -> f64; fn vreinterpret_s64_p64(a: poly64x1_t) -> int64x1_t; fn vreinterpret_u64_p64(a: poly64x1_t) -> uint64x1_t; fn vreinterpret_p64_s64(a: int64x1_t) -> poly64x1_t; fn vreinterpret_p64_u64(a: uint64x1_t) -> poly64x1_t; fn vreinterpretq_s64_p64(a: poly64x2_t) -> int64x2_t; fn vreinterpretq_u64_p64(a: poly64x2_t) -> uint64x2_t; fn vreinterpretq_p64_s64(a: int64x2_t) -> poly64x2_t; fn vreinterpretq_p64_u64(a: uint64x2_t) -> poly64x2_t; fn vreinterpret_s8_f64(a: float64x1_t) -> int8x8_t; fn vreinterpret_s16_f64(a: float64x1_t) -> int16x4_t; fn vreinterpret_s32_f64(a: float64x1_t) -> int32x2_t; fn vreinterpret_s64_f64(a: float64x1_t) -> int64x1_t; fn vreinterpretq_s8_f64(a: float64x2_t) -> int8x16_t; fn vreinterpretq_s16_f64(a: float64x2_t) -> int16x8_t; fn vreinterpretq_s32_f64(a: float64x2_t) -> int32x4_t; fn vreinterpretq_s64_f64(a: float64x2_t) -> int64x2_t; fn vreinterpret_u8_f64(a: float64x1_t) -> uint8x8_t; fn vreinterpret_u16_f64(a: float64x1_t) -> uint16x4_t; fn vreinterpret_u32_f64(a: float64x1_t) -> uint32x2_t; fn vreinterpret_u64_f64(a: float64x1_t) -> uint64x1_t; fn vreinterpretq_u8_f64(a: float64x2_t) -> uint8x16_t; fn vreinterpretq_u16_f64(a: float64x2_t) -> uint16x8_t; fn vreinterpretq_u32_f64(a: float64x2_t) -> uint32x4_t; fn vreinterpretq_u64_f64(a: float64x2_t) -> uint64x2_t; fn vreinterpret_p8_f64(a: float64x1_t) -> poly8x8_t; fn vreinterpret_p16_f64(a: float64x1_t) -> poly16x4_t; fn vreinterpret_p64_f32(a: float32x2_t) -> poly64x1_t; fn vreinterpret_p64_f64(a: float64x1_t) -> poly64x1_t; fn vreinterpretq_p8_f64(a: float64x2_t) -> poly8x16_t; fn vreinterpretq_p16_f64(a: float64x2_t) -> poly16x8_t; fn vreinterpretq_p64_f32(a: float32x4_t) -> poly64x2_t; fn vreinterpretq_p64_f64(a: float64x2_t) -> poly64x2_t; fn vreinterpretq_p128_f64(a: float64x2_t) -> p128; fn vreinterpret_f64_s8(a: int8x8_t) -> float64x1_t; fn vreinterpret_f64_s16(a: int16x4_t) -> float64x1_t; fn vreinterpret_f64_s32(a: int32x2_t) -> float64x1_t; fn vreinterpret_f64_s64(a: int64x1_t) -> float64x1_t; fn vreinterpretq_f64_s8(a: int8x16_t) -> float64x2_t; fn 
vreinterpretq_f64_s16(a: int16x8_t) -> float64x2_t; fn vreinterpretq_f64_s32(a: int32x4_t) -> float64x2_t; fn vreinterpretq_f64_s64(a: int64x2_t) -> float64x2_t; fn vreinterpret_f64_p8(a: poly8x8_t) -> float64x1_t; fn vreinterpret_f64_u16(a: uint16x4_t) -> float64x1_t; fn vreinterpret_f64_u32(a: uint32x2_t) -> float64x1_t; fn vreinterpret_f64_u64(a: uint64x1_t) -> float64x1_t; fn vreinterpretq_f64_p8(a: poly8x16_t) -> float64x2_t; fn vreinterpretq_f64_u16(a: uint16x8_t) -> float64x2_t; fn vreinterpretq_f64_u32(a: uint32x4_t) -> float64x2_t; fn vreinterpretq_f64_u64(a: uint64x2_t) -> float64x2_t; fn vreinterpret_f64_u8(a: uint8x8_t) -> float64x1_t; fn vreinterpret_f64_p16(a: poly16x4_t) -> float64x1_t; fn vreinterpret_f64_p64(a: poly64x1_t) -> float64x1_t; fn vreinterpret_f32_p64(a: poly64x1_t) -> float32x2_t; fn vreinterpretq_f64_u8(a: uint8x16_t) -> float64x2_t; fn vreinterpretq_f64_p16(a: poly16x8_t) -> float64x2_t; fn vreinterpretq_f64_p64(a: poly64x2_t) -> float64x2_t; fn vreinterpretq_f32_p64(a: poly64x2_t) -> float32x4_t; fn vreinterpretq_f64_p128(a: p128) -> float64x2_t; fn vreinterpret_f64_f32(a: float32x2_t) -> float64x1_t; fn vreinterpret_f32_f64(a: float64x1_t) -> float32x2_t; fn vreinterpretq_f64_f32(a: float32x4_t) -> float64x2_t; fn vreinterpretq_f32_f64(a: float64x2_t) -> float32x4_t; fn vrshld_s64(a: i64, b: i64) -> i64; fn vrshld_u64(a: u64, b: i64) -> u64; fn vrshrd_n_s64(a: i64) -> i64; fn vrshrd_n_u64(a: u64) -> u64; fn vrshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t; fn vrshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t; fn vrshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t; fn vrshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t; fn vrshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t; fn vrshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t; fn vrsrad_n_s64(a: i64, b: i64) -> i64; fn vrsrad_n_u64(a: u64, b: u64) -> u64; fn vrsubhn_high_s16(a: int8x8_t, b: int16x8_t, c: int16x8_t) -> int8x16_t; fn vrsubhn_high_s32(a: int16x4_t, b: int32x4_t, c: int32x4_t) -> int16x8_t; fn vrsubhn_high_s64(a: int32x2_t, b: int64x2_t, c: int64x2_t) -> int32x4_t; fn vrsubhn_high_u16(a: uint8x8_t, b: uint16x8_t, c: uint16x8_t) -> uint8x16_t; fn vrsubhn_high_u32(a: uint16x4_t, b: uint32x4_t, c: uint32x4_t) -> uint16x8_t; fn vrsubhn_high_u64(a: uint32x2_t, b: uint64x2_t, c: uint64x2_t) -> uint32x4_t; fn vset_lane_f64(a: f64, b: float64x1_t) -> float64x1_t; fn vsetq_lane_f64(a: f64, b: float64x2_t) -> float64x2_t; fn vshld_s64(a: i64, b: i64) -> i64; fn vshld_u64(a: u64, b: i64) -> u64; fn vshll_high_n_s8(a: int8x16_t) -> int16x8_t; fn vshll_high_n_s16(a: int16x8_t) -> int32x4_t; fn vshll_high_n_s32(a: int32x4_t) -> int64x2_t; fn vshll_high_n_u8(a: uint8x16_t) -> uint16x8_t; fn vshll_high_n_u16(a: uint16x8_t) -> uint32x4_t; fn vshll_high_n_u32(a: uint32x4_t) -> uint64x2_t; fn vshrn_high_n_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t; fn vshrn_high_n_s32(a: int16x4_t, b: int32x4_t) -> int16x8_t; fn vshrn_high_n_s64(a: int32x2_t, b: int64x2_t) -> int32x4_t; fn vshrn_high_n_u16(a: uint8x8_t, b: uint16x8_t) -> uint8x16_t; fn vshrn_high_n_u32(a: uint16x4_t, b: uint32x4_t) -> uint16x8_t; fn vshrn_high_n_u64(a: uint32x2_t, b: uint64x2_t) -> uint32x4_t; fn vtrn1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vtrn1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vtrn1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vtrn1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vtrn1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vtrn1_u8(a: uint8x8_t, b: 
uint8x8_t) -> uint8x8_t; fn vtrn1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vtrn1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; fn vtrn1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vtrn1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vtrn1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t; fn vtrn1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t; fn vtrn1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t; fn vtrn1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t; fn vtrn1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vtrn1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vtrn1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; fn vtrn1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; fn vtrn1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t; fn vtrn1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; fn vtrn1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; fn vtrn1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vtrn2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vtrn2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vtrn2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vtrn2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vtrn2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vtrn2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; fn vtrn2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vtrn2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; fn vtrn2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vtrn2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vtrn2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t; fn vtrn2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t; fn vtrn2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t; fn vtrn2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t; fn vtrn2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vtrn2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vtrn2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; fn vtrn2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; fn vtrn2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t; fn vtrn2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; fn vtrn2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; fn vtrn2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vzip1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vzip1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vzip1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vzip1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vzip1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vzip1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vzip1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vzip1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; fn vzip1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vzip1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; fn vzip1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vzip1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; fn vzip1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vzip1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; fn vzip1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t; fn vzip1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t; fn vzip1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t; fn vzip1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t; fn vzip1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t; fn vzip1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; fn vzip1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; fn vzip1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vzip2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vzip2q_s8(a: 
int8x16_t, b: int8x16_t) -> int8x16_t; fn vzip2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vzip2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vzip2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vzip2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vzip2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vzip2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; fn vzip2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vzip2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; fn vzip2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vzip2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; fn vzip2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vzip2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; fn vzip2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t; fn vzip2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t; fn vzip2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t; fn vzip2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t; fn vzip2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t; fn vzip2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; fn vzip2q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; fn vzip2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vuzp1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vuzp1q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vuzp1_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vuzp1q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vuzp1q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vuzp1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; fn vuzp1q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vuzp1_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; fn vuzp1q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vuzp1q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vuzp1_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t; fn vuzp1q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t; fn vuzp1_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t; fn vuzp1q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t; fn vuzp1_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vuzp1q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vuzp1_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; fn vuzp1q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; fn vuzp1q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t; fn vuzp1q_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; fn vuzp1_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; fn vuzp1q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vuzp2_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vuzp2q_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vuzp2_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t; fn vuzp2q_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vuzp2q_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vuzp2_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; fn vuzp2q_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vuzp2_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t; fn vuzp2q_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vuzp2q_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vuzp2_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t; fn vuzp2q_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t; fn vuzp2_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t; fn vuzp2q_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t; fn vuzp2_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t; fn vuzp2q_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vuzp2_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t; fn vuzp2q_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; fn vuzp2q_p64(a: poly64x2_t, b: poly64x2_t) -> poly64x2_t; fn vuzp2q_f32(a: 
float32x4_t, b: float32x4_t) -> float32x4_t; fn vuzp2_f32(a: float32x2_t, b: float32x2_t) -> float32x2_t; fn vuzp2q_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vabal_high_u8(a: uint16x8_t, b: uint8x16_t, c: uint8x16_t) -> uint16x8_t; fn vabal_high_u16(a: uint32x4_t, b: uint16x8_t, c: uint16x8_t) -> uint32x4_t; fn vabal_high_u32(a: uint64x2_t, b: uint32x4_t, c: uint32x4_t) -> uint64x2_t; fn vabal_high_s8(a: int16x8_t, b: int8x16_t, c: int8x16_t) -> int16x8_t; fn vabal_high_s16(a: int32x4_t, b: int16x8_t, c: int16x8_t) -> int32x4_t; fn vabal_high_s32(a: int64x2_t, b: int32x4_t, c: int32x4_t) -> int64x2_t; fn vqabs_s64(a: int64x1_t) -> int64x1_t; fn vqabsq_s64(a: int64x2_t) -> int64x2_t; fn vqabsb_s8(a: i8) -> i8; fn vqabsh_s16(a: i16) -> i16; fn vqabss_s32(a: i32) -> i32; fn vqabsd_s64(a: i64) -> i64; fn vslid_n_s64(a: i64, b: i64) -> i64; fn vslid_n_u64(a: u64, b: u64) -> u64; fn vsrid_n_s64(a: i64, b: i64) -> i64; fn vsrid_n_u64(a: u64, b: u64) -> u64; fn vcopy_lane_s64(_a: int64x1_t, b: int64x1_t) -> int64x1_t; fn vcopy_lane_u64( _a: uint64x1_t, b: uint64x1_t, ) -> uint64x1_t; fn vcopy_lane_p64( _a: poly64x1_t, b: poly64x1_t, ) -> poly64x1_t; fn vcopy_lane_f64( _a: float64x1_t, b: float64x1_t, ) -> float64x1_t; fn vcopy_laneq_s64( _a: int64x1_t, b: int64x2_t, ) -> int64x1_t; fn vcopy_laneq_u64( _a: uint64x1_t, b: uint64x2_t, ) -> uint64x1_t; fn vcopy_laneq_p64( _a: poly64x1_t, b: poly64x2_t, ) -> poly64x1_t; fn vcopy_laneq_f64( _a: float64x1_t, b: float64x2_t, ) -> float64x1_t; unsafe fn vld1_s8(ptr: *const i8) -> int8x8_t; unsafe fn vld1q_s8(ptr: *const i8) -> int8x16_t; unsafe fn vld1_s16(ptr: *const i16) -> int16x4_t; unsafe fn vld1q_s16(ptr: *const i16) -> int16x8_t; unsafe fn vld1_s32(ptr: *const i32) -> int32x2_t; unsafe fn vld1q_s32(ptr: *const i32) -> int32x4_t; unsafe fn vld1_s64(ptr: *const i64) -> int64x1_t; unsafe fn vld1q_s64(ptr: *const i64) -> int64x2_t; unsafe fn vld1_u8(ptr: *const u8) -> uint8x8_t; unsafe fn vld1q_u8(ptr: *const u8) -> uint8x16_t; unsafe fn vld1_u16(ptr: *const u16) -> uint16x4_t; unsafe fn vld1q_u16(ptr: *const u16) -> uint16x8_t; unsafe fn vld1_u32(ptr: *const u32) -> uint32x2_t; unsafe fn vld1q_u32(ptr: *const u32) -> uint32x4_t; unsafe fn vld1_u64(ptr: *const u64) -> uint64x1_t; unsafe fn vld1q_u64(ptr: *const u64) -> uint64x2_t; unsafe fn vld1_p8(ptr: *const p8) -> poly8x8_t; unsafe fn vld1q_p8(ptr: *const p8) -> poly8x16_t; unsafe fn vld1_p16(ptr: *const p16) -> poly16x4_t; unsafe fn vld1q_p16(ptr: *const p16) -> poly16x8_t; unsafe fn vld1_f32(ptr: *const f32) -> float32x2_t; unsafe fn vld1q_f32(ptr: *const f32) -> float32x4_t; unsafe fn vld1_f64(ptr: *const f64) -> float64x1_t; unsafe fn vld1q_f64(ptr: *const f64) -> float64x2_t; unsafe fn vld1_dup_f64(ptr: *const f64) -> float64x1_t; unsafe fn vld1q_dup_f64(ptr: *const f64) -> float64x2_t; unsafe fn vld1_lane_f64(ptr: *const f64, src: float64x1_t) -> float64x1_t; unsafe fn vld1q_lane_f64(ptr: *const f64, src: float64x2_t) -> float64x2_t; unsafe fn vst1_s8(ptr: *mut i8, a: int8x8_t); unsafe fn vst1q_s8(ptr: *mut i8, a: int8x16_t); unsafe fn vst1_s16(ptr: *mut i16, a: int16x4_t); unsafe fn vst1q_s16(ptr: *mut i16, a: int16x8_t); unsafe fn vst1_s32(ptr: *mut i32, a: int32x2_t); unsafe fn vst1q_s32(ptr: *mut i32, a: int32x4_t); unsafe fn vst1_s64(ptr: *mut i64, a: int64x1_t); unsafe fn vst1q_s64(ptr: *mut i64, a: int64x2_t); unsafe fn vst1_u8(ptr: *mut u8, a: uint8x8_t); unsafe fn vst1q_u8(ptr: *mut u8, a: uint8x16_t); unsafe fn vst1_u16(ptr: *mut u16, a: uint16x4_t); unsafe fn 
vst1q_u16(ptr: *mut u16, a: uint16x8_t); unsafe fn vst1_u32(ptr: *mut u32, a: uint32x2_t); unsafe fn vst1q_u32(ptr: *mut u32, a: uint32x4_t); unsafe fn vst1_u64(ptr: *mut u64, a: uint64x1_t); unsafe fn vst1q_u64(ptr: *mut u64, a: uint64x2_t); unsafe fn vst1_p8(ptr: *mut p8, a: poly8x8_t); unsafe fn vst1q_p8(ptr: *mut p8, a: poly8x16_t); unsafe fn vst1_p16(ptr: *mut p16, a: poly16x4_t); unsafe fn vst1q_p16(ptr: *mut p16, a: poly16x8_t); unsafe fn vst1_f32(ptr: *mut f32, a: float32x2_t); unsafe fn vst1q_f32(ptr: *mut f32, a: float32x4_t); unsafe fn vst1_f64(ptr: *mut f64, a: float64x1_t); unsafe fn vst1q_f64(ptr: *mut f64, a: float64x2_t); fn vabsd_s64(a: i64) -> i64; fn vabs_s64(a: int64x1_t) -> int64x1_t; fn vabsq_s64(a: int64x2_t) -> int64x2_t; fn vbsl_f64(a: uint64x1_t, b: float64x1_t, c: float64x1_t) -> float64x1_t; fn vbsl_p64(a: poly64x1_t, b: poly64x1_t, c: poly64x1_t) -> poly64x1_t; fn vbslq_f64(a: uint64x2_t, b: float64x2_t, c: float64x2_t) -> float64x2_t; fn vbslq_p64(a: poly64x2_t, b: poly64x2_t, c: poly64x2_t) -> poly64x2_t; fn vuqadd_s8(a: int8x8_t, b: uint8x8_t) -> int8x8_t; fn vuqaddq_s8(a: int8x16_t, b: uint8x16_t) -> int8x16_t; fn vuqadd_s16(a: int16x4_t, b: uint16x4_t) -> int16x4_t; fn vuqaddq_s16(a: int16x8_t, b: uint16x8_t) -> int16x8_t; fn vuqadd_s32(a: int32x2_t, b: uint32x2_t) -> int32x2_t; fn vuqaddq_s32(a: int32x4_t, b: uint32x4_t) -> int32x4_t; fn vuqadd_s64(a: int64x1_t, b: uint64x1_t) -> int64x1_t; fn vuqaddq_s64(a: int64x2_t, b: uint64x2_t) -> int64x2_t; fn vsqadd_u8(a: uint8x8_t, b: int8x8_t) -> uint8x8_t; fn vsqaddq_u8(a: uint8x16_t, b: int8x16_t) -> uint8x16_t; fn vsqadd_u16(a: uint16x4_t, b: int16x4_t) -> uint16x4_t; fn vsqaddq_u16(a: uint16x8_t, b: int16x8_t) -> uint16x8_t; fn vsqadd_u32(a: uint32x2_t, b: int32x2_t) -> uint32x2_t; fn vsqaddq_u32(a: uint32x4_t, b: int32x4_t) -> uint32x4_t; fn vsqadd_u64(a: uint64x1_t, b: int64x1_t) -> uint64x1_t; fn vsqaddq_u64(a: uint64x2_t, b: int64x2_t) -> uint64x2_t; fn vpaddq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vpaddq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vpaddq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vpaddq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vpaddq_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t; fn vpaddq_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t; fn vpaddq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vpaddq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vpaddd_s64(a: int64x2_t) -> i64; fn vpaddd_u64(a: uint64x2_t) -> u64; fn vaddv_s16(a: int16x4_t) -> i16; fn vaddv_s32(a: int32x2_t) -> i32; fn vaddv_s8(a: int8x8_t) -> i8; fn vaddv_u16(a: uint16x4_t) -> u16; fn vaddv_u32(a: uint32x2_t) -> u32; fn vaddv_u8(a: uint8x8_t) -> u8; fn vaddvq_s16(a: int16x8_t) -> i16; fn vaddvq_s32(a: int32x4_t) -> i32; fn vaddvq_s8(a: int8x16_t) -> i8; fn vaddvq_u16(a: uint16x8_t) -> u16; fn vaddvq_u32(a: uint32x4_t) -> u32; fn vaddvq_u8(a: uint8x16_t) -> u8; fn vaddvq_s64(a: int64x2_t) -> i64; fn vaddvq_u64(a: uint64x2_t) -> u64; fn vaddlv_s8(a: int8x8_t) -> i16; fn vaddlvq_s8(a: int8x16_t) -> i16; fn vaddlv_u8(a: uint8x8_t) -> u16; fn vaddlvq_u8(a: uint8x16_t) -> u16; fn vadd_f64(a: float64x1_t, b: float64x1_t) -> float64x1_t; fn vaddq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vadd_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t; fn vadd_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t; fn vaddd_s64(a: i64, b: i64) -> i64; fn vaddd_u64(a: u64, b: u64) -> u64; fn vmaxv_s8(a: int8x8_t) -> i8; fn vmaxvq_s8(a: int8x16_t) -> i8; fn vmaxv_s16(a: int16x4_t) -> i16; fn 
vmaxvq_s16(a: int16x8_t) -> i16; fn vmaxv_s32(a: int32x2_t) -> i32; fn vmaxvq_s32(a: int32x4_t) -> i32; fn vmaxv_u8(a: uint8x8_t) -> u8; fn vmaxvq_u8(a: uint8x16_t) -> u8; fn vmaxv_u16(a: uint16x4_t) -> u16; fn vmaxvq_u16(a: uint16x8_t) -> u16; fn vmaxv_u32(a: uint32x2_t) -> u32; fn vmaxvq_u32(a: uint32x4_t) -> u32; fn vmaxv_f32(a: float32x2_t) -> f32; fn vmaxvq_f32(a: float32x4_t) -> f32; fn vmaxvq_f64(a: float64x2_t) -> f64; fn vminv_s8(a: int8x8_t) -> i8; fn vminvq_s8(a: int8x16_t) -> i8; fn vminv_s16(a: int16x4_t) -> i16; fn vminvq_s16(a: int16x8_t) -> i16; fn vminv_s32(a: int32x2_t) -> i32; fn vminvq_s32(a: int32x4_t) -> i32; fn vminv_u8(a: uint8x8_t) -> u8; fn vminvq_u8(a: uint8x16_t) -> u8; fn vminv_u16(a: uint16x4_t) -> u16; fn vminvq_u16(a: uint16x8_t) -> u16; fn vminv_u32(a: uint32x2_t) -> u32; fn vminvq_u32(a: uint32x4_t) -> u32; fn vminv_f32(a: float32x2_t) -> f32; fn vminvq_f32(a: float32x4_t) -> f32; fn vminvq_f64(a: float64x2_t) -> f64; fn vpminq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vpminq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vpminq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vpminq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vpminq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vpminq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vpminq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; fn vpminq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vpmaxq_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t; fn vpmaxq_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t; fn vpmaxq_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t; fn vpmaxq_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t; fn vpmaxq_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t; fn vpmaxq_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t; fn vpmaxq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t; fn vpmaxq_f64(a: float64x2_t, b: float64x2_t) -> float64x2_t; fn vext_p64(a: poly64x1_t, _b: poly64x1_t) -> poly64x1_t; fn vext_f64(a: float64x1_t, _b: float64x1_t) -> float64x1_t; fn vdup_n_p64(value: p64) -> poly64x1_t; fn vdup_n_f64(value: f64) -> float64x1_t; fn vdupq_n_p64(value: p64) -> poly64x2_t; fn vdupq_n_f64(value: f64) -> float64x2_t; fn vmov_n_p64(value: p64) -> poly64x1_t; fn vmov_n_f64(value: f64) -> float64x1_t; fn vmovq_n_p64(value: p64) -> poly64x2_t; fn vmovq_n_f64(value: f64) -> float64x2_t; fn vget_high_f64(a: float64x2_t) -> float64x1_t; fn vget_high_p64(a: poly64x2_t) -> poly64x1_t; fn vget_low_f64(a: float64x2_t) -> float64x1_t; fn vget_low_p64(a: poly64x2_t) -> poly64x1_t; fn vget_lane_f64(v: float64x1_t) -> f64; fn vgetq_lane_f64(v: float64x2_t) -> f64; fn vcombine_f64(low: float64x1_t, high: float64x1_t) -> float64x2_t; fn vtbl1_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t; fn vtbl1_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t; fn vtbl1_p8(a: poly8x8_t, b: uint8x8_t) -> poly8x8_t; fn vtbl2_s8(a: int8x8x2_t, b: int8x8_t) -> int8x8_t; fn vtbl2_u8(a: uint8x8x2_t, b: uint8x8_t) -> uint8x8_t; fn vtbl2_p8(a: poly8x8x2_t, b: uint8x8_t) -> poly8x8_t; fn vtbl3_s8(a: int8x8x3_t, b: int8x8_t) -> int8x8_t; fn vtbl3_u8(a: uint8x8x3_t, b: uint8x8_t) -> uint8x8_t; fn vtbl3_p8(a: poly8x8x3_t, b: uint8x8_t) -> poly8x8_t; fn vtbl4_s8(a: int8x8x4_t, b: int8x8_t) -> int8x8_t; fn vtbl4_u8(a: uint8x8x4_t, b: uint8x8_t) -> uint8x8_t; fn vtbl4_p8(a: poly8x8x4_t, b: uint8x8_t) -> poly8x8_t; fn vtbx1_s8(a: int8x8_t, b: int8x8_t, c: int8x8_t) -> int8x8_t; fn vtbx1_u8(a: uint8x8_t, b: uint8x8_t, c: uint8x8_t) -> uint8x8_t; fn vtbx1_p8(a: poly8x8_t, b: poly8x8_t, c: uint8x8_t) -> poly8x8_t; fn 
vtbx2_s8(a: int8x8_t, b: int8x8x2_t, c: int8x8_t) -> int8x8_t; fn vtbx2_u8(a: uint8x8_t, b: uint8x8x2_t, c: uint8x8_t) -> uint8x8_t; fn vtbx2_p8(a: poly8x8_t, b: poly8x8x2_t, c: uint8x8_t) -> poly8x8_t; fn vtbx3_s8(a: int8x8_t, b: int8x8x3_t, c: int8x8_t) -> int8x8_t; fn vtbx3_u8(a: uint8x8_t, b: uint8x8x3_t, c: uint8x8_t) -> uint8x8_t; fn vtbx3_p8(a: poly8x8_t, b: poly8x8x3_t, c: uint8x8_t) -> poly8x8_t; fn vtbx4_s8(a: int8x8_t, b: int8x8x4_t, c: int8x8_t) -> int8x8_t; fn vtbx4_u8(a: uint8x8_t, b: uint8x8x4_t, c: uint8x8_t) -> uint8x8_t; fn vtbx4_p8(a: poly8x8_t, b: poly8x8x4_t, c: uint8x8_t) -> poly8x8_t; fn vqtbl1_s8(t: int8x16_t, idx: uint8x8_t) -> int8x8_t; fn vqtbl1q_s8(t: int8x16_t, idx: uint8x16_t) -> int8x16_t; fn vqtbl1_u8(t: uint8x16_t, idx: uint8x8_t) -> uint8x8_t; fn vqtbl1q_u8(t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t; fn vqtbl1_p8(t: poly8x16_t, idx: uint8x8_t) -> poly8x8_t; fn vqtbl1q_p8(t: poly8x16_t, idx: uint8x16_t) -> poly8x16_t; fn vqtbx1_s8(a: int8x8_t, t: int8x16_t, idx: uint8x8_t) -> int8x8_t; fn vqtbx1q_s8(a: int8x16_t, t: int8x16_t, idx: uint8x16_t) -> int8x16_t; fn vqtbx1_u8(a: uint8x8_t, t: uint8x16_t, idx: uint8x8_t) -> uint8x8_t; fn vqtbx1q_u8(a: uint8x16_t, t: uint8x16_t, idx: uint8x16_t) -> uint8x16_t; fn vqtbx1_p8(a: poly8x8_t, t: poly8x16_t, idx: uint8x8_t) -> poly8x8_t; fn vqtbx1q_p8(a: poly8x16_t, t: poly8x16_t, idx: uint8x16_t) -> poly8x16_t; fn vqtbl2_s8(t: int8x16x2_t, idx: uint8x8_t) -> int8x8_t; fn vqtbl2q_s8(t: int8x16x2_t, idx: uint8x16_t) -> int8x16_t; fn vqtbl2_u8(t: uint8x16x2_t, idx: uint8x8_t) -> uint8x8_t; fn vqtbl2q_u8(t: uint8x16x2_t, idx: uint8x16_t) -> uint8x16_t; fn vqtbl2_p8(t: poly8x16x2_t, idx: uint8x8_t) -> poly8x8_t; fn vqtbl2q_p8(t: poly8x16x2_t, idx: uint8x16_t) -> poly8x16_t; fn vqtbx2_s8(a: int8x8_t, t: int8x16x2_t, idx: uint8x8_t) -> int8x8_t; fn vqtbx2q_s8(a: int8x16_t, t: int8x16x2_t, idx: uint8x16_t) -> int8x16_t; fn vqtbx2_u8(a: uint8x8_t, t: uint8x16x2_t, idx: uint8x8_t) -> uint8x8_t; fn vqtbx2q_u8(a: uint8x16_t, t: uint8x16x2_t, idx: uint8x16_t) -> uint8x16_t; fn vqtbx2_p8(a: poly8x8_t, t: poly8x16x2_t, idx: uint8x8_t) -> poly8x8_t; fn vqtbx2q_p8(a: poly8x16_t, t: poly8x16x2_t, idx: uint8x16_t) -> poly8x16_t; fn vqtbl3_s8(t: int8x16x3_t, idx: uint8x8_t) -> int8x8_t; fn vqtbl3q_s8(t: int8x16x3_t, idx: uint8x16_t) -> int8x16_t; fn vqtbl3_u8(t: uint8x16x3_t, idx: uint8x8_t) -> uint8x8_t; fn vqtbl3q_u8(t: uint8x16x3_t, idx: uint8x16_t) -> uint8x16_t; fn vqtbl3_p8(t: poly8x16x3_t, idx: uint8x8_t) -> poly8x8_t; fn vqtbl3q_p8(t: poly8x16x3_t, idx: uint8x16_t) -> poly8x16_t; fn vqtbx3_s8(a: int8x8_t, t: int8x16x3_t, idx: uint8x8_t) -> int8x8_t; fn vqtbx3q_s8(a: int8x16_t, t: int8x16x3_t, idx: uint8x16_t) -> int8x16_t; fn vqtbx3_u8(a: uint8x8_t, t: uint8x16x3_t, idx: uint8x8_t) -> uint8x8_t; fn vqtbx3q_u8(a: uint8x16_t, t: uint8x16x3_t, idx: uint8x16_t) -> uint8x16_t; fn vqtbx3_p8(a: poly8x8_t, t: poly8x16x3_t, idx: uint8x8_t) -> poly8x8_t; fn vqtbx3q_p8(a: poly8x16_t, t: poly8x16x3_t, idx: uint8x16_t) -> poly8x16_t; fn vqtbl4_s8(t: int8x16x4_t, idx: uint8x8_t) -> int8x8_t; fn vqtbl4q_s8(t: int8x16x4_t, idx: uint8x16_t) -> int8x16_t; fn vqtbl4_u8(t: uint8x16x4_t, idx: uint8x8_t) -> uint8x8_t; fn vqtbl4q_u8(t: uint8x16x4_t, idx: uint8x16_t) -> uint8x16_t; fn vqtbl4_p8(t: poly8x16x4_t, idx: uint8x8_t) -> poly8x8_t; fn vqtbl4q_p8(t: poly8x16x4_t, idx: uint8x16_t) -> poly8x16_t; fn vqtbx4_s8(a: int8x8_t, t: int8x16x4_t, idx: uint8x8_t) -> int8x8_t; fn vqtbx4q_s8(a: int8x16_t, t: int8x16x4_t, idx: uint8x16_t) -> int8x16_t; fn 
vqtbx4_u8(a: uint8x8_t, t: uint8x16x4_t, idx: uint8x8_t) -> uint8x8_t;
        fn vqtbx4q_u8(a: uint8x16_t, t: uint8x16x4_t, idx: uint8x16_t) -> uint8x16_t;
        fn vqtbx4_p8(a: poly8x8_t, t: poly8x16x4_t, idx: uint8x8_t) -> poly8x8_t;
        fn vqtbx4q_p8(a: poly8x16_t, t: poly8x16x4_t, idx: uint8x16_t) -> poly8x16_t;
        fn vshld_n_s64(a: i64) -> i64;
        fn vshld_n_u64(a: u64) -> u64;
        fn vshrd_n_s64(a: i64) -> i64;
        fn vshrd_n_u64(a: u64) -> u64;
        fn vsrad_n_s64(a: i64, b: i64) -> i64;
        fn vsrad_n_u64(a: u64, b: u64) -> u64;
        fn vsli_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vsliq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vsli_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vsliq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vsli_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vsliq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vsli_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
        fn vsliq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
        fn vsli_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vsliq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vsli_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vsliq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vsli_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vsliq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vsli_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
        fn vsliq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
        fn vsli_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t;
        fn vsliq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t;
        fn vsli_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t;
        fn vsliq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t;
        fn vsri_n_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t;
        fn vsriq_n_s8(a: int8x16_t, b: int8x16_t) -> int8x16_t;
        fn vsri_n_s16(a: int16x4_t, b: int16x4_t) -> int16x4_t;
        fn vsriq_n_s16(a: int16x8_t, b: int16x8_t) -> int16x8_t;
        fn vsri_n_s32(a: int32x2_t, b: int32x2_t) -> int32x2_t;
        fn vsriq_n_s32(a: int32x4_t, b: int32x4_t) -> int32x4_t;
        fn vsri_n_s64(a: int64x1_t, b: int64x1_t) -> int64x1_t;
        fn vsriq_n_s64(a: int64x2_t, b: int64x2_t) -> int64x2_t;
        fn vsri_n_u8(a: uint8x8_t, b: uint8x8_t) -> uint8x8_t;
        fn vsriq_n_u8(a: uint8x16_t, b: uint8x16_t) -> uint8x16_t;
        fn vsri_n_u16(a: uint16x4_t, b: uint16x4_t) -> uint16x4_t;
        fn vsriq_n_u16(a: uint16x8_t, b: uint16x8_t) -> uint16x8_t;
        fn vsri_n_u32(a: uint32x2_t, b: uint32x2_t) -> uint32x2_t;
        fn vsriq_n_u32(a: uint32x4_t, b: uint32x4_t) -> uint32x4_t;
        fn vsri_n_u64(a: uint64x1_t, b: uint64x1_t) -> uint64x1_t;
        fn vsriq_n_u64(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t;
        fn vsri_n_p8(a: poly8x8_t, b: poly8x8_t) -> poly8x8_t;
        fn vsriq_n_p8(a: poly8x16_t, b: poly8x16_t) -> poly8x16_t;
        fn vsri_n_p16(a: poly16x4_t, b: poly16x4_t) -> poly16x4_t;
        fn vsriq_n_p16(a: poly16x8_t, b: poly16x8_t) -> poly16x8_t;
    }
}
fearless_simd-0.3.0/src/core_arch/fallback.rs
// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

/// A token for fallback SIMD.
#[derive(Clone, Copy, Debug)]
pub struct Fallback {
    _private: (),
}

impl Fallback {
    /// Create a SIMD token.
    #[inline]
    pub const fn new() -> Self {
        Self { _private: () }
    }
}
fearless_simd-0.3.0/src/core_arch/mod.rs
// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Access to architecture-specific intrinsics.
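//!
//! Each submodule exposes a zero-sized "token" type (`Fallback` here, or the
//! AVX/AVX2 tokens on x86) whose existence certifies that the corresponding
//! target features are available, so the intrinsics it wraps can be called
//! without re-checking. A minimal usage sketch follows; it is illustrative
//! only (not a doc-test from this crate), and the module path and the runtime
//! detection gate are assumptions requiring an x86_64 target with `std`:
//!
//! ```ignore
//! // Module path assumed for this sketch.
//! use fearless_simd::core_arch::x86::avx2::Avx2;
//!
//! // The runtime check discharges `new_unchecked`'s safety contract:
//! // "The required CPU features must be available."
//! if std::arch::is_x86_feature_detected!("avx2") {
//!     let avx2 = unsafe { Avx2::new_unchecked() };
//!     // `avx2` can now be passed around as proof that AVX2 is present.
//! }
//! ```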
#![expect( missing_docs, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] #[cfg(target_arch = "aarch64")] pub mod aarch64; pub mod fallback; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub mod x86; fearless_simd-0.3.0/src/core_arch/x86/avx.rs000064400000000000000000000350231046102023000167540ustar 00000000000000// Copyright 2024 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT //! Access to AVX intrinsics. use crate::impl_macros::delegate; #[cfg(target_arch = "x86")] use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] use core::arch::x86_64 as arch; use arch::*; /// A token for AVX intrinsics on `x86` and `x86_64`. #[derive(Clone, Copy, Debug)] pub struct Avx { _private: (), } #[expect( clippy::missing_safety_doc, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] impl Avx { /// Create a SIMD token. /// /// # Safety /// /// The required CPU features must be available. #[inline] pub const unsafe fn new_unchecked() -> Self { Self { _private: () } } delegate! { arch: fn _mm256_add_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_add_ps(a: __m256, b: __m256) -> __m256; fn _mm256_and_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_and_ps(a: __m256, b: __m256) -> __m256; fn _mm256_or_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_or_ps(a: __m256, b: __m256) -> __m256; fn _mm256_shuffle_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_shuffle_ps(a: __m256, b: __m256) -> __m256; fn _mm256_andnot_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_andnot_ps(a: __m256, b: __m256) -> __m256; fn _mm256_max_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_max_ps(a: __m256, b: __m256) -> __m256; fn _mm256_min_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_min_ps(a: __m256, b: __m256) -> __m256; fn _mm256_mul_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_mul_ps(a: __m256, b: __m256) -> __m256; fn _mm256_addsub_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_addsub_ps(a: __m256, b: __m256) -> __m256; fn _mm256_sub_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_sub_ps(a: __m256, b: __m256) -> __m256; fn _mm256_div_ps(a: __m256, b: __m256) -> __m256; fn _mm256_div_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_round_pd(a: __m256d) -> __m256d; fn _mm256_ceil_pd(a: __m256d) -> __m256d; fn _mm256_floor_pd(a: __m256d) -> __m256d; fn _mm256_round_ps(a: __m256) -> __m256; fn _mm256_ceil_ps(a: __m256) -> __m256; fn _mm256_floor_ps(a: __m256) -> __m256; fn _mm256_sqrt_ps(a: __m256) -> __m256; fn _mm256_sqrt_pd(a: __m256d) -> __m256d; fn _mm256_blend_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_blend_ps(a: __m256, b: __m256) -> __m256; fn _mm256_blendv_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d; fn _mm256_blendv_ps(a: __m256, b: __m256, c: __m256) -> __m256; fn _mm256_dp_ps(a: __m256, b: __m256) -> __m256; fn _mm256_hadd_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_hadd_ps(a: __m256, b: __m256) -> __m256; fn _mm256_hsub_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_hsub_ps(a: __m256, b: __m256) -> __m256; fn _mm256_xor_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_xor_ps(a: __m256, b: __m256) -> __m256; fn _mm_cmp_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm256_cmp_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm_cmp_ps(a: __m128, b: __m128) -> __m128; fn _mm256_cmp_ps(a: __m256, b: __m256) -> __m256; fn _mm_cmp_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmp_ss(a: __m128, b: __m128) -> __m128; fn _mm256_cvtepi32_pd(a: __m128i) -> __m256d; fn _mm256_cvtepi32_ps(a: __m256i) -> __m256; fn 
_mm256_cvtpd_ps(a: __m256d) -> __m128; fn _mm256_cvtps_epi32(a: __m256) -> __m256i; fn _mm256_cvtps_pd(a: __m128) -> __m256d; fn _mm256_cvttpd_epi32(a: __m256d) -> __m128i; fn _mm256_cvtpd_epi32(a: __m256d) -> __m128i; fn _mm256_cvttps_epi32(a: __m256) -> __m256i; fn _mm256_extractf128_ps(a: __m256) -> __m128; fn _mm256_extractf128_pd(a: __m256d) -> __m128d; fn _mm256_extractf128_si256(a: __m256i) -> __m128i; fn _mm256_zeroall(); fn _mm256_zeroupper(); fn _mm256_permutevar_ps(a: __m256, b: __m256i) -> __m256; fn _mm_permutevar_ps(a: __m128, b: __m128i) -> __m128; fn _mm256_permute_ps(a: __m256) -> __m256; fn _mm_permute_ps(a: __m128) -> __m128; fn _mm256_permutevar_pd(a: __m256d, b: __m256i) -> __m256d; fn _mm_permutevar_pd(a: __m128d, b: __m128i) -> __m128d; fn _mm256_permute_pd(a: __m256d) -> __m256d; fn _mm_permute_pd(a: __m128d) -> __m128d; fn _mm256_permute2f128_ps(a: __m256, b: __m256) -> __m256; fn _mm256_permute2f128_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_permute2f128_si256(a: __m256i, b: __m256i) -> __m256i; fn _mm256_broadcast_ss(f: &f32) -> __m256; fn _mm_broadcast_ss(f: &f32) -> __m128; fn _mm256_broadcast_sd(f: &f64) -> __m256d; fn _mm256_broadcast_ps(a: &__m128) -> __m256; fn _mm256_broadcast_pd(a: &__m128d) -> __m256d; fn _mm256_insertf128_ps(a: __m256, b: __m128) -> __m256; fn _mm256_insertf128_pd(a: __m256d, b: __m128d) -> __m256d; fn _mm256_insertf128_si256(a: __m256i, b: __m128i) -> __m256i; fn _mm256_insert_epi8(a: __m256i, i: i8) -> __m256i; fn _mm256_insert_epi16(a: __m256i, i: i16) -> __m256i; fn _mm256_insert_epi32(a: __m256i, i: i32) -> __m256i; unsafe fn _mm256_load_pd(mem_addr: *const f64) -> __m256d; unsafe fn _mm256_store_pd(mem_addr: *mut f64, a: __m256d); unsafe fn _mm256_load_ps(mem_addr: *const f32) -> __m256; unsafe fn _mm256_store_ps(mem_addr: *mut f32, a: __m256); unsafe fn _mm256_loadu_pd(mem_addr: *const f64) -> __m256d; unsafe fn _mm256_storeu_pd(mem_addr: *mut f64, a: __m256d); unsafe fn _mm256_loadu_ps(mem_addr: *const f32) -> __m256; unsafe fn _mm256_storeu_ps(mem_addr: *mut f32, a: __m256); unsafe fn _mm256_load_si256(mem_addr: *const __m256i) -> __m256i; unsafe fn _mm256_store_si256(mem_addr: *mut __m256i, a: __m256i); unsafe fn _mm256_loadu_si256(mem_addr: *const __m256i) -> __m256i; unsafe fn _mm256_storeu_si256(mem_addr: *mut __m256i, a: __m256i); unsafe fn _mm256_maskload_pd(mem_addr: *const f64, mask: __m256i) -> __m256d; unsafe fn _mm256_maskstore_pd(mem_addr: *mut f64, mask: __m256i, a: __m256d); unsafe fn _mm_maskload_pd(mem_addr: *const f64, mask: __m128i) -> __m128d; unsafe fn _mm_maskstore_pd(mem_addr: *mut f64, mask: __m128i, a: __m128d); unsafe fn _mm256_maskload_ps(mem_addr: *const f32, mask: __m256i) -> __m256; unsafe fn _mm256_maskstore_ps(mem_addr: *mut f32, mask: __m256i, a: __m256); unsafe fn _mm_maskload_ps(mem_addr: *const f32, mask: __m128i) -> __m128; unsafe fn _mm_maskstore_ps(mem_addr: *mut f32, mask: __m128i, a: __m128); fn _mm256_movehdup_ps(a: __m256) -> __m256; fn _mm256_moveldup_ps(a: __m256) -> __m256; fn _mm256_movedup_pd(a: __m256d) -> __m256d; unsafe fn _mm256_lddqu_si256(mem_addr: *const __m256i) -> __m256i; unsafe fn _mm256_stream_si256(mem_addr: *mut __m256i, a: __m256i); unsafe fn _mm256_stream_pd(mem_addr: *mut f64, a: __m256d); unsafe fn _mm256_stream_ps(mem_addr: *mut f32, a: __m256); fn _mm256_rcp_ps(a: __m256) -> __m256; fn _mm256_rsqrt_ps(a: __m256) -> __m256; fn _mm256_unpackhi_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_unpackhi_ps(a: __m256, b: __m256) -> __m256; fn 
_mm256_unpacklo_pd(a: __m256d, b: __m256d) -> __m256d; fn _mm256_unpacklo_ps(a: __m256, b: __m256) -> __m256; fn _mm256_testz_si256(a: __m256i, b: __m256i) -> i32; fn _mm256_testc_si256(a: __m256i, b: __m256i) -> i32; fn _mm256_testnzc_si256(a: __m256i, b: __m256i) -> i32; fn _mm256_testz_pd(a: __m256d, b: __m256d) -> i32; fn _mm256_testc_pd(a: __m256d, b: __m256d) -> i32; fn _mm256_testnzc_pd(a: __m256d, b: __m256d) -> i32; fn _mm_testz_pd(a: __m128d, b: __m128d) -> i32; fn _mm_testc_pd(a: __m128d, b: __m128d) -> i32; fn _mm_testnzc_pd(a: __m128d, b: __m128d) -> i32; fn _mm256_testz_ps(a: __m256, b: __m256) -> i32; fn _mm256_testc_ps(a: __m256, b: __m256) -> i32; fn _mm256_testnzc_ps(a: __m256, b: __m256) -> i32; fn _mm_testz_ps(a: __m128, b: __m128) -> i32; fn _mm_testc_ps(a: __m128, b: __m128) -> i32; fn _mm_testnzc_ps(a: __m128, b: __m128) -> i32; fn _mm256_movemask_pd(a: __m256d) -> i32; fn _mm256_movemask_ps(a: __m256) -> i32; fn _mm256_setzero_pd() -> __m256d; fn _mm256_setzero_ps() -> __m256; fn _mm256_setzero_si256() -> __m256i; fn _mm256_set_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d; fn _mm256_set_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256; fn _mm256_set_epi8( e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8, e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8, e16: i8, e17: i8, e18: i8, e19: i8, e20: i8, e21: i8, e22: i8, e23: i8, e24: i8, e25: i8, e26: i8, e27: i8, e28: i8, e29: i8, e30: i8, e31: i8, ) -> __m256i; fn _mm256_set_epi16( e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16, e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16, e14: i16, e15: i16, ) -> __m256i; fn _mm256_set_epi32( e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32, ) -> __m256i; fn _mm256_set_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i; fn _mm256_setr_pd(a: f64, b: f64, c: f64, d: f64) -> __m256d; fn _mm256_setr_ps(a: f32, b: f32, c: f32, d: f32, e: f32, f: f32, g: f32, h: f32) -> __m256; fn _mm256_setr_epi8( e00: i8, e01: i8, e02: i8, e03: i8, e04: i8, e05: i8, e06: i8, e07: i8, e08: i8, e09: i8, e10: i8, e11: i8, e12: i8, e13: i8, e14: i8, e15: i8, e16: i8, e17: i8, e18: i8, e19: i8, e20: i8, e21: i8, e22: i8, e23: i8, e24: i8, e25: i8, e26: i8, e27: i8, e28: i8, e29: i8, e30: i8, e31: i8, ) -> __m256i; fn _mm256_setr_epi16( e00: i16, e01: i16, e02: i16, e03: i16, e04: i16, e05: i16, e06: i16, e07: i16, e08: i16, e09: i16, e10: i16, e11: i16, e12: i16, e13: i16, e14: i16, e15: i16, ) -> __m256i; fn _mm256_setr_epi32( e0: i32, e1: i32, e2: i32, e3: i32, e4: i32, e5: i32, e6: i32, e7: i32, ) -> __m256i; fn _mm256_setr_epi64x(a: i64, b: i64, c: i64, d: i64) -> __m256i; fn _mm256_set1_pd(a: f64) -> __m256d; fn _mm256_set1_ps(a: f32) -> __m256; fn _mm256_set1_epi8(a: i8) -> __m256i; fn _mm256_set1_epi16(a: i16) -> __m256i; fn _mm256_set1_epi32(a: i32) -> __m256i; fn _mm256_set1_epi64x(a: i64) -> __m256i; fn _mm256_castpd_ps(a: __m256d) -> __m256; fn _mm256_castps_pd(a: __m256) -> __m256d; fn _mm256_castps_si256(a: __m256) -> __m256i; fn _mm256_castsi256_ps(a: __m256i) -> __m256; fn _mm256_castpd_si256(a: __m256d) -> __m256i; fn _mm256_castsi256_pd(a: __m256i) -> __m256d; fn _mm256_castps256_ps128(a: __m256) -> __m128; fn _mm256_castpd256_pd128(a: __m256d) -> __m128d; fn _mm256_castsi256_si128(a: __m256i) -> __m128i; fn _mm256_castps128_ps256(a: __m128) -> __m256; fn _mm256_castpd128_pd256(a: __m128d) -> __m256d; fn _mm256_castsi128_si256(a: __m128i) -> __m256i; fn 
_mm256_zextps128_ps256(a: __m128) -> __m256;
        fn _mm256_zextsi128_si256(a: __m128i) -> __m256i;
        fn _mm256_zextpd128_pd256(a: __m128d) -> __m256d;
        fn _mm256_undefined_ps() -> __m256;
        fn _mm256_undefined_pd() -> __m256d;
        fn _mm256_undefined_si256() -> __m256i;
        fn _mm256_set_m128(hi: __m128, lo: __m128) -> __m256;
        fn _mm256_set_m128d(hi: __m128d, lo: __m128d) -> __m256d;
        fn _mm256_set_m128i(hi: __m128i, lo: __m128i) -> __m256i;
        fn _mm256_setr_m128(lo: __m128, hi: __m128) -> __m256;
        fn _mm256_setr_m128d(lo: __m128d, hi: __m128d) -> __m256d;
        fn _mm256_setr_m128i(lo: __m128i, hi: __m128i) -> __m256i;
        unsafe fn _mm256_loadu2_m128(hiaddr: *const f32, loaddr: *const f32) -> __m256;
        unsafe fn _mm256_loadu2_m128d(hiaddr: *const f64, loaddr: *const f64) -> __m256d;
        unsafe fn _mm256_loadu2_m128i(hiaddr: *const __m128i, loaddr: *const __m128i) -> __m256i;
        unsafe fn _mm256_storeu2_m128(hiaddr: *mut f32, loaddr: *mut f32, a: __m256);
        unsafe fn _mm256_storeu2_m128d(hiaddr: *mut f64, loaddr: *mut f64, a: __m256d);
        unsafe fn _mm256_storeu2_m128i(hiaddr: *mut __m128i, loaddr: *mut __m128i, a: __m256i);
        fn _mm256_cvtss_f32(a: __m256) -> f32;
    }
}
fearless_simd-0.3.0/src/core_arch/x86/avx2.rs
// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

#![expect(
    clippy::missing_safety_doc,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]

//! Access to AVX2 intrinsics.

use crate::impl_macros::delegate;

#[cfg(target_arch = "x86")]
use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64 as arch;

use arch::*;

/// A token for AVX2 intrinsics on `x86` and `x86_64`.
#[derive(Clone, Copy, Debug)]
pub struct Avx2 {
    _private: (),
}

impl Avx2 {
    /// Create a SIMD token.
    ///
    /// # Safety
    ///
    /// The required CPU features must be available.
    pub const unsafe fn new_unchecked() -> Self {
        Self { _private: () }
    }

    delegate!
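    // The `delegate!` invocation below (from `crate::impl_macros`) surfaces each
    // listed `core::arch` intrinsic as a method on the `Avx2` token. Constructing
    // the token is the one unsafe step; the listing's split between `fn` and
    // `unsafe fn` items suggests the former become callable without further
    // `unsafe`, while raw-pointer loads/stores stay unsafe. A hypothetical call
    // sketch (variable names and the detection gate are illustrative, not from
    // this crate's docs):
    //
    //     if std::arch::is_x86_feature_detected!("avx2") {
    //         // Safety: AVX2 availability was just checked above.
    //         let avx2 = unsafe { Avx2::new_unchecked() };
    //         // Given two `__m256i` values `a` and `b` obtained elsewhere:
    //         let sum = avx2._mm256_add_epi32(a, b); // lane-wise i32 addition
    //     }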
{ arch: fn _mm256_abs_epi32(a: __m256i) -> __m256i; fn _mm256_abs_epi16(a: __m256i) -> __m256i; fn _mm256_abs_epi8(a: __m256i) -> __m256i; fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i; fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_alignr_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i; fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i; fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i; fn _mm_blend_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm256_blend_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_blend_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i; fn _mm_broadcastb_epi8(a: __m128i) -> __m128i; fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i; fn _mm_broadcastd_epi32(a: __m128i) -> __m128i; fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i; fn _mm_broadcastq_epi64(a: __m128i) -> __m128i; fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i; fn _mm_broadcastsd_pd(a: __m128d) -> __m128d; fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d; fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i; fn _mm_broadcastss_ps(a: __m128) -> __m128; fn _mm256_broadcastss_ps(a: __m128) -> __m256; fn _mm_broadcastw_epi16(a: __m128i) -> __m128i; fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i; fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i; fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i; fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i; fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i; fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i; fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i; fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i; fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i; fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i; fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i; fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i; fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i; fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i; fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i; fn _mm256_extracti128_si256(a: __m256i) -> __m128i; fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i; unsafe fn _mm_i32gather_epi32( slice: *const i32, offsets: __m128i, ) -> __m128i; unsafe fn _mm_mask_i32gather_epi32( src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, ) -> __m128i; unsafe fn _mm256_i32gather_epi32( slice: *const i32, offsets: __m256i, ) -> __m256i; unsafe fn _mm256_mask_i32gather_epi32( src: __m256i, slice: *const i32, offsets: 
__m256i, mask: __m256i, ) -> __m256i; unsafe fn _mm_i32gather_ps(slice: *const f32, offsets: __m128i) -> __m128; unsafe fn _mm_mask_i32gather_ps( src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, ) -> __m128; unsafe fn _mm256_i32gather_ps( slice: *const f32, offsets: __m256i, ) -> __m256; unsafe fn _mm256_mask_i32gather_ps( src: __m256, slice: *const f32, offsets: __m256i, mask: __m256, ) -> __m256; unsafe fn _mm_i32gather_epi64( slice: *const i64, offsets: __m128i, ) -> __m128i; unsafe fn _mm_mask_i32gather_epi64( src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, ) -> __m128i; unsafe fn _mm256_i32gather_epi64( slice: *const i64, offsets: __m128i, ) -> __m256i; unsafe fn _mm256_mask_i32gather_epi64( src: __m256i, slice: *const i64, offsets: __m128i, mask: __m256i, ) -> __m256i; unsafe fn _mm_i32gather_pd( slice: *const f64, offsets: __m128i, ) -> __m128d; unsafe fn _mm_mask_i32gather_pd( src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, ) -> __m128d; unsafe fn _mm256_i32gather_pd( slice: *const f64, offsets: __m128i, ) -> __m256d; unsafe fn _mm256_mask_i32gather_pd( src: __m256d, slice: *const f64, offsets: __m128i, mask: __m256d, ) -> __m256d; unsafe fn _mm_i64gather_epi32( slice: *const i32, offsets: __m128i, ) -> __m128i; unsafe fn _mm_mask_i64gather_epi32( src: __m128i, slice: *const i32, offsets: __m128i, mask: __m128i, ) -> __m128i; unsafe fn _mm256_i64gather_epi32( slice: *const i32, offsets: __m256i, ) -> __m128i; unsafe fn _mm256_mask_i64gather_epi32( src: __m128i, slice: *const i32, offsets: __m256i, mask: __m128i, ) -> __m128i; unsafe fn _mm_i64gather_ps(slice: *const f32, offsets: __m128i) -> __m128; unsafe fn _mm_mask_i64gather_ps( src: __m128, slice: *const f32, offsets: __m128i, mask: __m128, ) -> __m128; unsafe fn _mm256_i64gather_ps( slice: *const f32, offsets: __m256i, ) -> __m128; unsafe fn _mm256_mask_i64gather_ps( src: __m128, slice: *const f32, offsets: __m256i, mask: __m128, ) -> __m128; unsafe fn _mm_i64gather_epi64( slice: *const i64, offsets: __m128i, ) -> __m128i; unsafe fn _mm_mask_i64gather_epi64( src: __m128i, slice: *const i64, offsets: __m128i, mask: __m128i, ) -> __m128i; unsafe fn _mm256_i64gather_epi64( slice: *const i64, offsets: __m256i, ) -> __m256i; unsafe fn _mm256_mask_i64gather_epi64( src: __m256i, slice: *const i64, offsets: __m256i, mask: __m256i, ) -> __m256i; unsafe fn _mm_i64gather_pd( slice: *const f64, offsets: __m128i, ) -> __m128d; unsafe fn _mm_mask_i64gather_pd( src: __m128d, slice: *const f64, offsets: __m128i, mask: __m128d, ) -> __m128d; unsafe fn _mm256_i64gather_pd( slice: *const f64, offsets: __m256i, ) -> __m256d; unsafe fn _mm256_mask_i64gather_pd( src: __m256d, slice: *const f64, offsets: __m256i, mask: __m256d, ) -> __m256d; fn _mm256_inserti128_si256(a: __m256i, b: __m128i) -> __m256i; fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i; unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i; unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i; unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i; unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i; unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i); unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i); unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i); unsafe fn 
_mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i); fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_movemask_epi8(a: __m256i) -> i32; fn _mm256_mpsadbw_epu8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i; fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_permute4x64_epi64(a: __m256i) -> __m256i; fn _mm256_permute2x128_si256(a: __m256i, b: __m256i) -> __m256i; fn _mm256_permute4x64_pd(a: __m256d) -> __m256d; fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256; fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_shuffle_epi32(a: __m256i) -> __m256i; fn _mm256_shufflehi_epi16(a: __m256i) -> __m256i; fn _mm256_shufflelo_epi16(a: __m256i) -> __m256i; fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i; fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i; fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i; fn _mm256_slli_epi16(a: __m256i) -> __m256i; fn _mm256_slli_epi32(a: __m256i) -> __m256i; fn _mm256_slli_epi64(a: __m256i) -> __m256i; fn _mm256_slli_si256(a: __m256i) -> __m256i; fn _mm256_bslli_epi128(a: __m256i) -> __m256i; fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i; fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i; fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i; fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i; fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i; fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i; fn _mm256_srai_epi16(a: __m256i) -> __m256i; fn _mm256_srai_epi32(a: __m256i) -> __m256i; fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i; fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i; fn _mm256_srli_si256(a: __m256i) -> __m256i; fn _mm256_bsrli_epi128(a: __m256i) -> __m256i; fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i; fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i; fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i; fn _mm256_srli_epi16(a: __m256i) -> __m256i; 
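// Note on the immediate-operand entries above and below (e.g.
// `_mm256_slli_epi16`, `_mm256_blend_epi32`): the shift count or blend mask
// is absent from these listings because `core::arch` takes it as a const
// generic. Presumably `delegate!` forwards that const parameter, so a call
// would look like the following sketch (not checked against the macro's
// actual expansion; `avx2` names a hypothetical `Avx2` token value):
//
//     let shifted = avx2._mm256_slli_epi16::<4>(v); // each i16 lane << 4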
fn _mm256_srli_epi32(a: __m256i) -> __m256i; fn _mm256_srli_epi64(a: __m256i) -> __m256i; fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i; fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i; fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i; fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i; fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i; fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i; fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i; fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i; fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i; fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i; fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i; fn _mm256_extract_epi8(a: __m256i) -> i32; fn _mm256_extract_epi16(a: __m256i) -> i32; fn _mm256_extract_epi32(a: __m256i) -> i32; fn _mm256_cvtsd_f64(a: __m256d) -> f64; fn _mm256_cvtsi256_si32(a: __m256i) -> i32; } } fearless_simd-0.3.0/src/core_arch/x86/fma.rs000064400000000000000000000056461046102023000167310ustar 00000000000000// Copyright 2024 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT //! Access to FMA intrinsics. use crate::impl_macros::delegate; #[cfg(target_arch = "x86")] use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] use core::arch::x86_64 as arch; use arch::*; /// A token for FMA intrinsics on `x86` and `x86_64`. #[derive(Clone, Copy, Debug)] pub struct Fma { _private: (), } impl Fma { /// Create a SIMD token. /// /// # Safety /// /// The required CPU features must be available. #[inline] pub const unsafe fn new_unchecked() -> Self { Self { _private: () } } delegate! 
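// The listing below is what `delegate!` turns into inherent methods on `Fma`,
// so the token witnesses the CPU feature and the call sites stay safe. A
// minimal usage sketch; the import path and the assumption that delegated
// methods keep the intrinsic names are mine, not verified against the crate:
//
//     if is_x86_feature_detected!("fma") {
//         // Safety: the `fma` feature was detected on the line above.
//         let fma = unsafe { Fma::new_unchecked() };
//         unsafe {
//             use core::arch::x86_64::*;
//             let (a, b, c) = (_mm_set1_ps(2.0), _mm_set1_ps(3.0), _mm_set1_ps(1.0));
//             let r = fma._mm_fmadd_ps(a, b, c); // each lane: 2.0 * 3.0 + 1.0 = 7.0
//             let mut out = [0.0f32; 4];
//             _mm_storeu_ps(out.as_mut_ptr(), r);
//             assert_eq!(out, [7.0; 4]);
//         }
//     }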
    {
        arch:
        fn _mm_fmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm256_fmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
        fn _mm_fmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128;
        fn _mm256_fmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256;
        fn _mm_fmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm_fmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128;
        fn _mm_fmaddsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm256_fmaddsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
        fn _mm_fmaddsub_ps(a: __m128, b: __m128, c: __m128) -> __m128;
        fn _mm256_fmaddsub_ps(a: __m256, b: __m256, c: __m256) -> __m256;
        fn _mm_fmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm256_fmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
        fn _mm_fmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128;
        fn _mm256_fmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256;
        fn _mm_fmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm_fmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128;
        fn _mm_fmsubadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm256_fmsubadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
        fn _mm_fmsubadd_ps(a: __m128, b: __m128, c: __m128) -> __m128;
        fn _mm256_fmsubadd_ps(a: __m256, b: __m256, c: __m256) -> __m256;
        fn _mm_fnmadd_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm256_fnmadd_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
        fn _mm_fnmadd_ps(a: __m128, b: __m128, c: __m128) -> __m128;
        fn _mm256_fnmadd_ps(a: __m256, b: __m256, c: __m256) -> __m256;
        fn _mm_fnmadd_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm_fnmadd_ss(a: __m128, b: __m128, c: __m128) -> __m128;
        fn _mm_fnmsub_pd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm256_fnmsub_pd(a: __m256d, b: __m256d, c: __m256d) -> __m256d;
        fn _mm_fnmsub_ps(a: __m128, b: __m128, c: __m128) -> __m128;
        fn _mm256_fnmsub_ps(a: __m256, b: __m256, c: __m256) -> __m256;
        fn _mm_fnmsub_sd(a: __m128d, b: __m128d, c: __m128d) -> __m128d;
        fn _mm_fnmsub_ss(a: __m128, b: __m128, c: __m128) -> __m128;
    }
}

fearless_simd-0.3.0/src/core_arch/x86/mod.rs

// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Access to intrinsics on `x86` and `x86_64`.

mod avx;
mod avx2;
mod fma;
mod sse;
mod sse2;
mod sse3;
mod sse4_1;
mod sse4_2;
mod ssse3;

pub use avx::Avx;
pub use avx2::Avx2;
pub use fma::Fma;
pub use sse::Sse;
pub use sse2::Sse2;
pub use sse3::Sse3;
pub use sse4_1::Sse4_1;
pub use sse4_2::Sse4_2;
pub use ssse3::Ssse3;

fearless_simd-0.3.0/src/core_arch/x86/sse.rs

// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Access to SSE intrinsics.

use crate::impl_macros::delegate;

#[cfg(target_arch = "x86")]
use core::arch::x86 as arch;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64 as arch;

use arch::*;

/// A token for SSE intrinsics on `x86` and `x86_64`.
#[derive(Clone, Copy, Debug)]
pub struct Sse {
    _private: (),
}

#[expect(
    clippy::missing_safety_doc,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]
impl Sse {
    /// Create a SIMD token.
    ///
    /// # Safety
    ///
    /// The required CPU features must be available.
    #[inline]
    pub const unsafe fn new_unchecked() -> Self {
        Self { _private: () }
    }

    delegate!
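// Same pattern for SSE: each signature below becomes a method on `Sse`. One
// hedged sketch, pairing a packed comparison with movemask to extract a
// per-lane bitmask (again assuming the delegated methods keep the intrinsic
// names; not verified against the crate's public API):
//
//     if is_x86_feature_detected!("sse") {
//         // Safety: the `sse` feature was detected on the line above.
//         let sse = unsafe { Sse::new_unchecked() };
//         unsafe {
//             use core::arch::x86_64::*;
//             let a = _mm_setr_ps(1.0, 5.0, 2.0, 8.0);
//             let b = _mm_set1_ps(4.0);
//             let lt = sse._mm_cmplt_ps(a, b);    // all-ones lanes where a < b
//             let bits = sse._mm_movemask_ps(lt); // one bit per lane
//             assert_eq!(bits, 0b0101);           // lanes 0 and 2 are below 4.0
//         }
//     }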
{ arch: fn _mm_add_ss(a: __m128, b: __m128) -> __m128; fn _mm_add_ps(a: __m128, b: __m128) -> __m128; fn _mm_sub_ss(a: __m128, b: __m128) -> __m128; fn _mm_sub_ps(a: __m128, b: __m128) -> __m128; fn _mm_mul_ss(a: __m128, b: __m128) -> __m128; fn _mm_mul_ps(a: __m128, b: __m128) -> __m128; fn _mm_div_ss(a: __m128, b: __m128) -> __m128; fn _mm_div_ps(a: __m128, b: __m128) -> __m128; fn _mm_sqrt_ss(a: __m128) -> __m128; fn _mm_sqrt_ps(a: __m128) -> __m128; fn _mm_rcp_ss(a: __m128) -> __m128; fn _mm_rcp_ps(a: __m128) -> __m128; fn _mm_rsqrt_ss(a: __m128) -> __m128; fn _mm_rsqrt_ps(a: __m128) -> __m128; fn _mm_min_ss(a: __m128, b: __m128) -> __m128; fn _mm_min_ps(a: __m128, b: __m128) -> __m128; fn _mm_max_ss(a: __m128, b: __m128) -> __m128; fn _mm_max_ps(a: __m128, b: __m128) -> __m128; fn _mm_and_ps(a: __m128, b: __m128) -> __m128; fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128; fn _mm_or_ps(a: __m128, b: __m128) -> __m128; fn _mm_xor_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128; fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128; fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128; fn _mm_comieq_ss(a: __m128, b: __m128) -> i32; fn _mm_comilt_ss(a: __m128, b: __m128) -> i32; fn _mm_comile_ss(a: __m128, b: __m128) -> i32; fn _mm_comigt_ss(a: __m128, b: __m128) -> i32; fn _mm_comige_ss(a: __m128, b: __m128) -> i32; fn _mm_comineq_ss(a: __m128, b: __m128) -> i32; fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32; fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32; fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32; fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32; fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32; fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32; fn _mm_cvtss_si32(a: __m128) -> i32; fn _mm_cvt_ss2si(a: __m128) -> i32; fn _mm_cvttss_si32(a: __m128) -> i32; fn _mm_cvtt_ss2si(a: __m128) -> i32; fn _mm_cvtss_f32(a: __m128) -> f32; fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128; fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128; fn _mm_set_ss(a: f32) -> __m128; fn _mm_set1_ps(a: f32) -> __m128; fn _mm_set_ps1(a: f32) -> __m128; fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128; fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128; fn _mm_setzero_ps() -> __m128; fn _mm_shuffle_ps(a: __m128, b: __m128) -> __m128; fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128; fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128; fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128; fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128; fn _mm_movemask_ps(a: 
__m128) -> i32; unsafe fn _mm_load_ss(p: *const f32) -> __m128; unsafe fn _mm_load1_ps(p: *const f32) -> __m128; unsafe fn _mm_load_ps1(p: *const f32) -> __m128; unsafe fn _mm_load_ps(p: *const f32) -> __m128; unsafe fn _mm_loadu_ps(p: *const f32) -> __m128; unsafe fn _mm_loadr_ps(p: *const f32) -> __m128; unsafe fn _mm_loadu_si64(mem_addr: *const u8) -> __m128i; unsafe fn _mm_store_ss(p: *mut f32, a: __m128); unsafe fn _mm_store1_ps(p: *mut f32, a: __m128); unsafe fn _mm_store_ps1(p: *mut f32, a: __m128); unsafe fn _mm_store_ps(p: *mut f32, a: __m128); unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128); unsafe fn _mm_storer_ps(p: *mut f32, a: __m128); fn _mm_move_ss(a: __m128, b: __m128) -> __m128; fn _mm_sfence(); #[expect(clippy::not_unsafe_ptr_arg_deref, reason="Prefetch has no preconditions, so is valid to accept a pointer.")] fn _mm_prefetch(p: *const i8); fn _mm_undefined_ps() -> __m128; #[allow(non_snake_case)] fn _MM_TRANSPOSE4_PS( row0: &mut __m128, row1: &mut __m128, row2: &mut __m128, row3: &mut __m128, ); unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128); } } fearless_simd-0.3.0/src/core_arch/x86/sse2.rs000064400000000000000000000337131046102023000170360ustar 00000000000000// Copyright 2024 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT //! Access to SSE2 intrinsics. use crate::impl_macros::delegate; #[cfg(target_arch = "x86")] use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] use core::arch::x86_64 as arch; use arch::*; /// A token for SSE2 intrinsics on `x86` and `x86_64`. #[derive(Clone, Copy, Debug)] pub struct Sse2 { _private: (), } #[expect( clippy::missing_safety_doc, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] impl Sse2 { /// Create a SIMD token. /// /// # Safety /// /// The required CPU features must be available. #[inline] pub const unsafe fn new_unchecked() -> Self { Self { _private: () } } delegate! 
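// SSE2 sketch: the saturating forms below clamp instead of wrapping, which is
// the usual reason to reach for `_mm_adds_epu8` over `_mm_add_epi8`. Hedged
// example (method surface assumed from the listing):
//
//     if is_x86_feature_detected!("sse2") {
//         // Safety: the `sse2` feature was detected on the line above.
//         let sse2 = unsafe { Sse2::new_unchecked() };
//         unsafe {
//             use core::arch::x86_64::*;
//             let a = _mm_set1_epi8(200u8 as i8);
//             let b = _mm_set1_epi8(100u8 as i8);
//             let sum = sse2._mm_adds_epu8(a, b); // 200 + 100 saturates to 255
//             let mut out = [0u8; 16];
//             _mm_storeu_si128(out.as_mut_ptr().cast(), sum);
//             assert_eq!(out, [255; 16]);
//         }
//     }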
{ arch: fn _mm_pause(); unsafe fn _mm_clflush(p: *const u8); fn _mm_lfence(); fn _mm_mfence(); fn _mm_add_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_add_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_add_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_add_epi64(a: __m128i, b: __m128i) -> __m128i; fn _mm_adds_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_adds_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_adds_epu8(a: __m128i, b: __m128i) -> __m128i; fn _mm_adds_epu16(a: __m128i, b: __m128i) -> __m128i; fn _mm_avg_epu8(a: __m128i, b: __m128i) -> __m128i; fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i; fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_max_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_max_epu8(a: __m128i, b: __m128i) -> __m128i; fn _mm_min_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_min_epu8(a: __m128i, b: __m128i) -> __m128i; fn _mm_mulhi_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_mulhi_epu16(a: __m128i, b: __m128i) -> __m128i; fn _mm_mullo_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_mul_epu32(a: __m128i, b: __m128i) -> __m128i; fn _mm_sad_epu8(a: __m128i, b: __m128i) -> __m128i; fn _mm_sub_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_sub_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_sub_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_sub_epi64(a: __m128i, b: __m128i) -> __m128i; fn _mm_subs_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_subs_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_subs_epu8(a: __m128i, b: __m128i) -> __m128i; fn _mm_subs_epu16(a: __m128i, b: __m128i) -> __m128i; fn _mm_slli_si128(a: __m128i) -> __m128i; fn _mm_bslli_si128(a: __m128i) -> __m128i; fn _mm_bsrli_si128(a: __m128i) -> __m128i; fn _mm_slli_epi16(a: __m128i) -> __m128i; fn _mm_sll_epi16(a: __m128i, count: __m128i) -> __m128i; fn _mm_slli_epi32(a: __m128i) -> __m128i; fn _mm_sll_epi32(a: __m128i, count: __m128i) -> __m128i; fn _mm_slli_epi64(a: __m128i) -> __m128i; fn _mm_sll_epi64(a: __m128i, count: __m128i) -> __m128i; fn _mm_srai_epi16(a: __m128i) -> __m128i; fn _mm_sra_epi16(a: __m128i, count: __m128i) -> __m128i; fn _mm_srai_epi32(a: __m128i) -> __m128i; fn _mm_sra_epi32(a: __m128i, count: __m128i) -> __m128i; fn _mm_srli_si128(a: __m128i) -> __m128i; fn _mm_srli_epi16(a: __m128i) -> __m128i; fn _mm_srl_epi16(a: __m128i, count: __m128i) -> __m128i; fn _mm_srli_epi32(a: __m128i) -> __m128i; fn _mm_srl_epi32(a: __m128i, count: __m128i) -> __m128i; fn _mm_srli_epi64(a: __m128i) -> __m128i; fn _mm_srl_epi64(a: __m128i, count: __m128i) -> __m128i; fn _mm_and_si128(a: __m128i, b: __m128i) -> __m128i; fn _mm_andnot_si128(a: __m128i, b: __m128i) -> __m128i; fn _mm_or_si128(a: __m128i, b: __m128i) -> __m128i; fn _mm_xor_si128(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmpeq_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmpeq_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmpeq_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmpgt_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmpgt_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmpgt_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmplt_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmplt_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmplt_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_cvtepi32_pd(a: __m128i) -> __m128d; fn _mm_cvtsi32_sd(a: __m128d, b: i32) -> __m128d; fn _mm_cvtepi32_ps(a: __m128i) -> __m128; fn _mm_cvtps_epi32(a: __m128) -> __m128i; fn _mm_cvtsi32_si128(a: i32) -> __m128i; fn _mm_cvtsi128_si32(a: __m128i) -> i32; fn _mm_set_epi64x(e1: i64, e0: i64) -> 
__m128i; fn _mm_set_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i; fn _mm_set_epi16( e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, ) -> __m128i; fn _mm_set_epi8( e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8, ) -> __m128i; fn _mm_set1_epi64x(a: i64) -> __m128i; fn _mm_set1_epi32(a: i32) -> __m128i; fn _mm_set1_epi16(a: i16) -> __m128i; fn _mm_set1_epi8(a: i8) -> __m128i; fn _mm_setr_epi32(e3: i32, e2: i32, e1: i32, e0: i32) -> __m128i; fn _mm_setr_epi16( e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, ) -> __m128i; fn _mm_setr_epi8( e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8, ) -> __m128i; fn _mm_setzero_si128() -> __m128i; unsafe fn _mm_loadl_epi64(mem_addr: *const __m128i) -> __m128i; unsafe fn _mm_load_si128(mem_addr: *const __m128i) -> __m128i; unsafe fn _mm_loadu_si128(mem_addr: *const __m128i) -> __m128i; unsafe fn _mm_maskmoveu_si128(a: __m128i, mask: __m128i, mem_addr: *mut i8); unsafe fn _mm_store_si128(mem_addr: *mut __m128i, a: __m128i); unsafe fn _mm_storeu_si128(mem_addr: *mut __m128i, a: __m128i); unsafe fn _mm_storel_epi64(mem_addr: *mut __m128i, a: __m128i); unsafe fn _mm_stream_si128(mem_addr: *mut __m128i, a: __m128i); unsafe fn _mm_stream_si32(mem_addr: *mut i32, a: i32); fn _mm_move_epi64(a: __m128i) -> __m128i; fn _mm_packs_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_packs_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_packus_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_extract_epi16(a: __m128i) -> i32; fn _mm_insert_epi16(a: __m128i, i: i32) -> __m128i; fn _mm_movemask_epi8(a: __m128i) -> i32; fn _mm_shuffle_epi32(a: __m128i) -> __m128i; fn _mm_shufflehi_epi16(a: __m128i) -> __m128i; fn _mm_shufflelo_epi16(a: __m128i) -> __m128i; fn _mm_unpackhi_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_unpackhi_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_unpackhi_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_unpackhi_epi64(a: __m128i, b: __m128i) -> __m128i; fn _mm_unpacklo_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_unpacklo_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_unpacklo_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_unpacklo_epi64(a: __m128i, b: __m128i) -> __m128i; fn _mm_add_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_add_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_div_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_div_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_max_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_max_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_min_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_min_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_mul_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_mul_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_sqrt_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_sqrt_pd(a: __m128d) -> __m128d; fn _mm_sub_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_sub_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_and_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_andnot_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_or_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_xor_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpeq_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmplt_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmple_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpgt_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpge_sd(a: __m128d, b: __m128d) -> __m128d; fn 
_mm_cmpord_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpunord_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpneq_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpnlt_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpnle_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpngt_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpnge_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpeq_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmplt_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmple_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpgt_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpge_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpord_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpunord_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpneq_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpnlt_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpnle_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpngt_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_cmpnge_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_comieq_sd(a: __m128d, b: __m128d) -> i32; fn _mm_comilt_sd(a: __m128d, b: __m128d) -> i32; fn _mm_comile_sd(a: __m128d, b: __m128d) -> i32; fn _mm_comigt_sd(a: __m128d, b: __m128d) -> i32; fn _mm_comige_sd(a: __m128d, b: __m128d) -> i32; fn _mm_comineq_sd(a: __m128d, b: __m128d) -> i32; fn _mm_ucomieq_sd(a: __m128d, b: __m128d) -> i32; fn _mm_ucomilt_sd(a: __m128d, b: __m128d) -> i32; fn _mm_ucomile_sd(a: __m128d, b: __m128d) -> i32; fn _mm_ucomigt_sd(a: __m128d, b: __m128d) -> i32; fn _mm_ucomige_sd(a: __m128d, b: __m128d) -> i32; fn _mm_ucomineq_sd(a: __m128d, b: __m128d) -> i32; fn _mm_cvtpd_ps(a: __m128d) -> __m128; fn _mm_cvtps_pd(a: __m128) -> __m128d; fn _mm_cvtpd_epi32(a: __m128d) -> __m128i; fn _mm_cvtsd_si32(a: __m128d) -> i32; fn _mm_cvtsd_ss(a: __m128, b: __m128d) -> __m128; fn _mm_cvtsd_f64(a: __m128d) -> f64; fn _mm_cvtss_sd(a: __m128d, b: __m128) -> __m128d; fn _mm_cvttpd_epi32(a: __m128d) -> __m128i; fn _mm_cvttsd_si32(a: __m128d) -> i32; fn _mm_cvttps_epi32(a: __m128) -> __m128i; fn _mm_set_sd(a: f64) -> __m128d; fn _mm_set1_pd(a: f64) -> __m128d; fn _mm_set_pd1(a: f64) -> __m128d; fn _mm_set_pd(a: f64, b: f64) -> __m128d; fn _mm_setr_pd(a: f64, b: f64) -> __m128d; fn _mm_setzero_pd() -> __m128d; fn _mm_movemask_pd(a: __m128d) -> i32; unsafe fn _mm_load_pd(mem_addr: *const f64) -> __m128d; unsafe fn _mm_load_sd(mem_addr: *const f64) -> __m128d; unsafe fn _mm_loadh_pd(a: __m128d, mem_addr: *const f64) -> __m128d; unsafe fn _mm_loadl_pd(a: __m128d, mem_addr: *const f64) -> __m128d; unsafe fn _mm_stream_pd(mem_addr: *mut f64, a: __m128d); unsafe fn _mm_store_sd(mem_addr: *mut f64, a: __m128d); unsafe fn _mm_store_pd(mem_addr: *mut f64, a: __m128d); unsafe fn _mm_storeu_pd(mem_addr: *mut f64, a: __m128d); unsafe fn _mm_store1_pd(mem_addr: *mut f64, a: __m128d); unsafe fn _mm_store_pd1(mem_addr: *mut f64, a: __m128d); unsafe fn _mm_storer_pd(mem_addr: *mut f64, a: __m128d); unsafe fn _mm_storeh_pd(mem_addr: *mut f64, a: __m128d); unsafe fn _mm_storel_pd(mem_addr: *mut f64, a: __m128d); unsafe fn _mm_load1_pd(mem_addr: *const f64) -> __m128d; unsafe fn _mm_load_pd1(mem_addr: *const f64) -> __m128d; unsafe fn _mm_loadr_pd(mem_addr: *const f64) -> __m128d; unsafe fn _mm_loadu_pd(mem_addr: *const f64) -> __m128d; fn _mm_shuffle_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_move_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_castpd_ps(a: __m128d) -> __m128; fn _mm_castpd_si128(a: __m128d) -> __m128i; fn _mm_castps_pd(a: __m128) -> __m128d; fn _mm_castps_si128(a: __m128) -> __m128i; fn 
_mm_castsi128_pd(a: __m128i) -> __m128d; fn _mm_castsi128_ps(a: __m128i) -> __m128; fn _mm_undefined_pd() -> __m128d; fn _mm_undefined_si128() -> __m128i; fn _mm_unpackhi_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_unpacklo_pd(a: __m128d, b: __m128d) -> __m128d; } } fearless_simd-0.3.0/src/core_arch/x86/sse3.rs000064400000000000000000000026261046102023000170360ustar 00000000000000// Copyright 2024 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT //! Access to SSE3 intrinsics. use crate::impl_macros::delegate; #[cfg(target_arch = "x86")] use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] use core::arch::x86_64 as arch; use arch::*; /// A token for SSE3 intrinsics on `x86` and `x86_64`. #[derive(Clone, Copy, Debug)] pub struct Sse3 { _private: (), } #[expect( clippy::missing_safety_doc, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] impl Sse3 { /// Create a SIMD token. /// /// # Safety /// /// The required CPU features must be available. #[inline] pub const unsafe fn new_unchecked() -> Self { Self { _private: () } } delegate! { arch: fn _mm_addsub_ps(a: __m128, b: __m128) -> __m128; fn _mm_addsub_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_hadd_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_hadd_ps(a: __m128, b: __m128) -> __m128; fn _mm_hsub_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_hsub_ps(a: __m128, b: __m128) -> __m128; unsafe fn _mm_lddqu_si128(mem_addr: *const __m128i) -> __m128i; fn _mm_movedup_pd(a: __m128d) -> __m128d; unsafe fn _mm_loaddup_pd(mem_addr: *const f64) -> __m128d; fn _mm_movehdup_ps(a: __m128) -> __m128; fn _mm_moveldup_ps(a: __m128) -> __m128; } } fearless_simd-0.3.0/src/core_arch/x86/sse4_1.rs000064400000000000000000000103161046102023000172520ustar 00000000000000// Copyright 2024 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT //! Access to SSE4.1 intrinsics. use crate::impl_macros::delegate; #[cfg(target_arch = "x86")] use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] use core::arch::x86_64 as arch; use arch::*; /// A token for SSE4.1 intrinsics on `x86` and `x86_64`. #[derive(Clone, Copy, Debug)] pub struct Sse4_1 { _private: (), } impl Sse4_1 { /// Create a SIMD token. /// /// # Safety /// /// The required CPU features must be available. #[inline] pub const unsafe fn new_unchecked() -> Self { Self { _private: () } } delegate! 
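// SSE4.1 adds the packed rounding family. Note that `_mm_round_*`,
// `_mm_blend_*`, `_mm_insert_*`, and `_mm_extract_*` appear below without
// their immediate parameter, which `core::arch` models as a const generic.
// Hedged sketch with `_mm_floor_ps`, which takes no immediate:
//
//     if is_x86_feature_detected!("sse4.1") {
//         // Safety: the `sse4.1` feature was detected on the line above.
//         let sse4_1 = unsafe { Sse4_1::new_unchecked() };
//         unsafe {
//             use core::arch::x86_64::*;
//             let v = _mm_setr_ps(1.5, -1.5, 2.9, -0.1);
//             let f = sse4_1._mm_floor_ps(v);
//             let mut out = [0.0f32; 4];
//             _mm_storeu_ps(out.as_mut_ptr(), f);
//             assert_eq!(out, [1.0, -2.0, 2.0, -1.0]);
//         }
//     }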
{ arch: fn _mm_blendv_epi8(a: __m128i, b: __m128i, mask: __m128i) -> __m128i; fn _mm_blend_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_blendv_pd(a: __m128d, b: __m128d, mask: __m128d) -> __m128d; fn _mm_blendv_ps(a: __m128, b: __m128, mask: __m128) -> __m128; fn _mm_blend_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_blend_ps(a: __m128, b: __m128) -> __m128; fn _mm_extract_ps(a: __m128) -> i32; fn _mm_extract_epi8(a: __m128i) -> i32; fn _mm_extract_epi32(a: __m128i) -> i32; fn _mm_insert_ps(a: __m128, b: __m128) -> __m128; fn _mm_insert_epi8(a: __m128i, i: i32) -> __m128i; fn _mm_insert_epi32(a: __m128i, i: i32) -> __m128i; fn _mm_max_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_max_epu16(a: __m128i, b: __m128i) -> __m128i; fn _mm_max_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_max_epu32(a: __m128i, b: __m128i) -> __m128i; fn _mm_min_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_min_epu16(a: __m128i, b: __m128i) -> __m128i; fn _mm_min_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_min_epu32(a: __m128i, b: __m128i) -> __m128i; fn _mm_packus_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmpeq_epi64(a: __m128i, b: __m128i) -> __m128i; fn _mm_cvtepi8_epi16(a: __m128i) -> __m128i; fn _mm_cvtepi8_epi32(a: __m128i) -> __m128i; fn _mm_cvtepi8_epi64(a: __m128i) -> __m128i; fn _mm_cvtepi16_epi32(a: __m128i) -> __m128i; fn _mm_cvtepi16_epi64(a: __m128i) -> __m128i; fn _mm_cvtepi32_epi64(a: __m128i) -> __m128i; fn _mm_cvtepu8_epi16(a: __m128i) -> __m128i; fn _mm_cvtepu8_epi32(a: __m128i) -> __m128i; fn _mm_cvtepu8_epi64(a: __m128i) -> __m128i; fn _mm_cvtepu16_epi32(a: __m128i) -> __m128i; fn _mm_cvtepu16_epi64(a: __m128i) -> __m128i; fn _mm_cvtepu32_epi64(a: __m128i) -> __m128i; fn _mm_dp_pd(a: __m128d, b: __m128d) -> __m128d; fn _mm_dp_ps(a: __m128, b: __m128) -> __m128; fn _mm_floor_pd(a: __m128d) -> __m128d; fn _mm_floor_ps(a: __m128) -> __m128; fn _mm_floor_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_floor_ss(a: __m128, b: __m128) -> __m128; fn _mm_ceil_pd(a: __m128d) -> __m128d; fn _mm_ceil_ps(a: __m128) -> __m128; fn _mm_ceil_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_ceil_ss(a: __m128, b: __m128) -> __m128; fn _mm_round_pd(a: __m128d) -> __m128d; fn _mm_round_ps(a: __m128) -> __m128; fn _mm_round_sd(a: __m128d, b: __m128d) -> __m128d; fn _mm_round_ss(a: __m128, b: __m128) -> __m128; fn _mm_minpos_epu16(a: __m128i) -> __m128i; fn _mm_mul_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_mullo_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_mpsadbw_epu8(a: __m128i, b: __m128i) -> __m128i; fn _mm_testz_si128(a: __m128i, mask: __m128i) -> i32; fn _mm_testc_si128(a: __m128i, mask: __m128i) -> i32; fn _mm_testnzc_si128(a: __m128i, mask: __m128i) -> i32; fn _mm_test_all_zeros(a: __m128i, mask: __m128i) -> i32; fn _mm_test_all_ones(a: __m128i) -> i32; fn _mm_test_mix_ones_zeros(a: __m128i, mask: __m128i) -> i32; } } fearless_simd-0.3.0/src/core_arch/x86/sse4_2.rs000064400000000000000000000040041046102023000172500ustar 00000000000000// Copyright 2024 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT //! Access to SSE4.2 intrinsics. use crate::impl_macros::delegate; #[cfg(target_arch = "x86")] use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] use core::arch::x86_64 as arch; use arch::*; /// A token for SSE4.2 intrinsics on `x86` and `x86_64`. #[derive(Clone, Copy, Debug)] pub struct Sse4_2 { _private: (), } impl Sse4_2 { /// Create a SIMD token. /// /// # Safety /// /// The required CPU features must be available. 
#[inline] pub const unsafe fn new_unchecked() -> Self { Self { _private: () } } delegate! { arch: fn _mm_cmpistrm(a: __m128i, b: __m128i) -> __m128i; fn _mm_cmpistri(a: __m128i, b: __m128i) -> i32; fn _mm_cmpistrz(a: __m128i, b: __m128i) -> i32; fn _mm_cmpistrc(a: __m128i, b: __m128i) -> i32; fn _mm_cmpistrs(a: __m128i, b: __m128i) -> i32; fn _mm_cmpistro(a: __m128i, b: __m128i) -> i32; fn _mm_cmpistra(a: __m128i, b: __m128i) -> i32; fn _mm_cmpestrm(a: __m128i, la: i32, b: __m128i, lb: i32) -> __m128i; fn _mm_cmpestri(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32; fn _mm_cmpestrz(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32; fn _mm_cmpestrc(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32; fn _mm_cmpestrs(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32; fn _mm_cmpestro(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32; fn _mm_cmpestra(a: __m128i, la: i32, b: __m128i, lb: i32) -> i32; fn _mm_crc32_u8(crc: u32, v: u8) -> u32; fn _mm_crc32_u16(crc: u32, v: u16) -> u32; fn _mm_crc32_u32(crc: u32, v: u32) -> u32; fn _mm_cmpgt_epi64(a: __m128i, b: __m128i) -> __m128i; } } fearless_simd-0.3.0/src/core_arch/x86/ssse3.rs000064400000000000000000000031701046102023000172140ustar 00000000000000// Copyright 2024 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT //! Access to SSSE3 intrinsics. use crate::impl_macros::delegate; #[cfg(target_arch = "x86")] use core::arch::x86 as arch; #[cfg(target_arch = "x86_64")] use core::arch::x86_64 as arch; use arch::*; /// A token for SSSE3 intrinsics on `x86` and `x86_64`. #[derive(Clone, Copy, Debug)] pub struct Ssse3 { _private: (), } impl Ssse3 { /// Create a SIMD token. /// /// # Safety /// /// The required CPU features must be available. #[inline] pub const unsafe fn new_unchecked() -> Self { Self { _private: () } } delegate! 
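// SSSE3's headline operation is `_mm_shuffle_epi8`, a byte table lookup in
// which each lane of `b` indexes into `a` (a set high bit selects zero).
// Hedged sketch reversing the 16 bytes of a vector (method surface assumed
// from the listing below):
//
//     if is_x86_feature_detected!("ssse3") {
//         // Safety: the `ssse3` feature was detected on the line above.
//         let ssse3 = unsafe { Ssse3::new_unchecked() };
//         unsafe {
//             use core::arch::x86_64::*;
//             let data = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
//             let rev = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
//             let out = ssse3._mm_shuffle_epi8(data, rev); // bytes now 15, 14, ..., 0
//         }
//     }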
{ arch: fn _mm_abs_epi8(a: __m128i) -> __m128i; fn _mm_abs_epi16(a: __m128i) -> __m128i; fn _mm_abs_epi32(a: __m128i) -> __m128i; fn _mm_shuffle_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_alignr_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_hadd_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_hsub_epi32(a: __m128i, b: __m128i) -> __m128i; fn _mm_maddubs_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_mulhrs_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_sign_epi8(a: __m128i, b: __m128i) -> __m128i; fn _mm_sign_epi16(a: __m128i, b: __m128i) -> __m128i; fn _mm_sign_epi32(a: __m128i, b: __m128i) -> __m128i; } } fearless_simd-0.3.0/src/generated/avx2.rs000064400000000000000000006171761046102023000164410ustar 00000000000000// Copyright 2025 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT // This file is autogenerated by fearless_simd_gen #![expect( unused_variables, clippy::todo, reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" )] use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal}; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, }; #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; use core::ops::*; #[doc = r#" The SIMD token for the "AVX2" and "FMA" level."#] #[derive(Clone, Copy, Debug)] pub struct Avx2 { pub avx2: crate::core_arch::x86::Avx2, } impl Avx2 { #[doc = r" Create a SIMD token."] #[doc = r""] #[doc = r" # Safety"] #[doc = r""] #[doc = r" The AVX2 and FMA CPU feature must be available."] #[inline] pub const unsafe fn new_unchecked() -> Self { Avx2 { avx2: unsafe { crate::core_arch::x86::Avx2::new_unchecked() }, } } } impl Seal for Avx2 {} impl Simd for Avx2 { type f32s = f32x4; type u8s = u8x16; type i8s = i8x16; type u16s = u16x8; type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4; #[inline(always)] fn level(self) -> Level { Level::Avx2(self) } #[inline] fn vectorize R, R>(self, f: F) -> R { #[target_feature(enable = "avx2,fma")] #[inline] unsafe fn vectorize_avx2 R, R>(f: F) -> R { f() } unsafe { vectorize_avx2(f) } } #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4 { unsafe { _mm_set1_ps(val).simd_into(self) } } #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } } #[inline(always)] fn neg_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) } } #[inline(always)] fn sqrt_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn div_f32x4(self, a: f32x4, b: 
f32x4) -> f32x4 {
        unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
        unsafe {
            let mask = _mm_set1_ps(-0.0);
            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 {
        unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 {
        unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 {
        unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 {
        unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 {
        unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) }
    }
    #[inline(always)]
    fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
        unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
        unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
        unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
        unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
        unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 {
        unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn madd_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
        unsafe { _mm_fmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
    }
    #[inline(always)]
    fn msub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 {
        unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
    }
    #[inline(always)]
    fn floor_f32x4(self, a: f32x4) -> f32x4 {
        unsafe { _mm_floor_ps(a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn fract_f32x4(self, a: f32x4) -> f32x4 {
        a - a.trunc()
    }
    #[inline(always)]
    fn trunc_f32x4(self, a: f32x4) -> f32x4 {
        // The rounding mode is a const-generic parameter of the intrinsic,
        // not a runtime argument.
        unsafe {
            _mm_round_ps::<{ _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
        }
    }
    #[inline(always)]
    fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 {
        unsafe {
            let mask = _mm_castsi128_ps(a.into());
            _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, c.into())).simd_into(self)
        }
    }
    #[inline(always)]
    fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 {
        let mut result = [0.0; 8usize];
        result[0..4usize].copy_from_slice(&a.val);
        result[4usize..8usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 {
        f64x2 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 {
        i32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_u8_f32x4(self, a:
f32x4) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { unsafe { _mm_cvtps_epi32(_mm_max_ps(_mm_floor_ps(a.into()), _mm_set1_ps(0.0))).simd_into(self) } } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { unsafe { _mm_cvtps_epi32(a.trunc().into()).simd_into(self) } } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { unsafe { _mm_set1_epi8(val).simd_into(self) } } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { a ^ !0 } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { todo!() } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { unsafe { let val = a.into(); let shift_count = _mm_cvtsi32_si128(shift as i32); let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128())); let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128())); let lo_shifted = _mm_sra_epi16(lo_16, shift_count); let hi_shifted = _mm_sra_epi16(hi_16, shift_count); _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) } } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { unsafe { let val = a.into(); let shift_count = _mm_cvtsi32_si128(shift as i32); let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128())); let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128())); let lo_shifted = _mm_sll_epi16(lo_16, shift_count); let hi_shifted = _mm_sll_epi16(hi_16, shift_count); _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) } } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmplt_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, 
b: i8x16) -> i8x16 { unsafe { let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { unsafe { _mm_set1_epi8(val as _).simd_into(self) } } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { a ^ !0 } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { todo!() } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { unsafe { let val = a.into(); let shift_count = _mm_cvtsi32_si128(shift as i32); let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); let lo_shifted = _mm_srl_epi16(lo_16, shift_count); let hi_shifted = _mm_srl_epi16(hi_16, shift_count); _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) } } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { unsafe { let val = a.into(); let shift_count = _mm_cvtsi32_si128(shift as i32); let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); let lo_shifted = _mm_sll_epi16(lo_16, shift_count); let hi_shifted = _mm_sll_epi16(hi_16, shift_count); _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) } } #[inline(always)] fn 
simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 {
        // Equality is signedness-agnostic, so no sign-bit adjustment is
        // needed here (unlike the ordered comparisons below).
        unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 {
        unsafe {
            let sign_bit = _mm_set1_epi8(0x80u8 as _);
            let a_signed = _mm_xor_si128(a.into(), sign_bit);
            let b_signed = _mm_xor_si128(b.into(), sign_bit);
            _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 {
        unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 {
        unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 {
        unsafe {
            let sign_bit = _mm_set1_epi8(0x80u8 as _);
            let a_signed = _mm_xor_si128(a.into(), sign_bit);
            let b_signed = _mm_xor_si128(b.into(), sign_bit);
            _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self)
        }
    }
    #[inline(always)]
    fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
        unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
        unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
        unsafe {
            let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
        unsafe {
            let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 {
        unsafe {
            _mm_or_si128(
                _mm_and_si128(a.into(), b.into()),
                _mm_andnot_si128(a.into(), c.into()),
            )
            .simd_into(self)
        }
    }
    #[inline(always)]
    fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
        unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 {
        unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 {
        let mut result = [0; 32usize];
        result[0..16usize].copy_from_slice(&a.val);
        result[16usize..32usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn widen_u8x16(self, a: u8x16) -> u16x16 {
        unsafe {
            let raw = a.into();
            // The low eight bytes widen into the first half of the result,
            // the high eight bytes into the second.
            let lo = _mm_cvtepu8_epi16(raw).simd_into(self);
            let hi = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self);
            self.combine_u16x8(lo, hi)
        }
    }
    #[inline(always)]
    fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 {
        u32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn splat_mask8x16(self, val: i8) -> mask8x16 {
        unsafe { _mm_set1_epi8(val).simd_into(self) }
    }
    #[inline(always)]
    fn not_mask8x16(self, a: mask8x16) -> mask8x16 {
        a ^ !0
    }
    #[inline(always)]
    fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
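    // The `simd_lt_u8x16` / `simd_gt_u8x16` bodies above use a standard
    // trick: x86 has no unsigned byte compare, but XOR-ing both operands
    // with 0x80 maps u8 order onto i8 order, so the signed `_mm_cmpgt_epi8`
    // gives the unsigned answer. Worked lane: 200 (0xC8) ^ 0x80 = 72, and
    // 100 (0x64) ^ 0x80 = -28 as i8; 72 > -28 agrees with 200 > 100.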
#[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask8x16( self, a: mask8x16, b: mask8x16, c: mask8x16, ) -> mask8x16 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { unsafe { _mm_set1_epi16(val).simd_into(self) } } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { a ^ !0 } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmplt_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) 
-> i16x8 {
        unsafe {
            let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15);
            let t1 = _mm_shuffle_epi8(a.into(), mask);
            let t2 = _mm_shuffle_epi8(b.into(), mask);
            _mm_unpacklo_epi64(t1, t2).simd_into(self)
        }
    }
    #[inline(always)]
    fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 {
        unsafe {
            _mm_or_si128(
                _mm_and_si128(a.into(), b.into()),
                _mm_andnot_si128(a.into(), c.into()),
            )
            .simd_into(self)
        }
    }
    #[inline(always)]
    fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
        unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 {
        unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 {
        let mut result = [0; 16usize];
        result[0..8usize].copy_from_slice(&a.val);
        result[8usize..16usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)]
    fn neg_i16x8(self, a: i16x8) -> i16x8 {
        unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 {
        u8x16 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 {
        u32x4 {
            val: bytemuck::cast(a.val),
            simd: a.simd,
        }
    }
    #[inline(always)]
    fn splat_u16x8(self, val: u16) -> u16x8 {
        unsafe { _mm_set1_epi16(val as _).simd_into(self) }
    }
    #[inline(always)]
    fn not_u16x8(self, a: u16x8) -> u16x8 {
        a ^ !0
    }
    #[inline(always)]
    fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
        unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
        unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
        unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
        unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
        unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
        unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 {
        unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 {
        core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self)
    }
    #[inline(always)]
    fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 {
        unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) }
    }
    #[inline(always)]
    fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 {
        // As with the u8 lanes, equality needs no sign-bit adjustment; the
        // signed compare-equal works for unsigned lanes too.
        unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 {
        unsafe {
            let sign_bit = _mm_set1_epi16(0x8000u16 as _);
            let a_signed = _mm_xor_si128(a.into(), sign_bit);
            let b_signed = _mm_xor_si128(b.into(), sign_bit);
            _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self)
        }
    }
    #[inline(always)]
    fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 {
        unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) }
    }
    #[inline(always)]
    fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 {
        unsafe {
_mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { let sign_bit = _mm_set1_epi16(0x8000u16 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self) } } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_mask16x8(self, val: i16) -> mask16x8 { unsafe { _mm_set1_epi16(val).simd_into(self) } } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { a ^ !0 } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask16x8( self, a: mask16x8, b: mask16x8, c: mask16x8, ) -> mask16x8 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { unsafe { _mm_set1_epi32(val).simd_into(self) } } #[inline(always)] fn not_i32x4(self, a: i32x4) -> 
i32x4 { a ^ !0 } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_srav_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmplt_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); _mm_unpackhi_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { unsafe { _mm_sub_epi32(_mm_setzero_si128(), 
a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) } } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { unsafe { _mm_set1_epi32(val as _).simd_into(self) } } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { a ^ !0 } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_srlv_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { /* equality is sign-agnostic; no sign-bit bias needed */ unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { let sign_bit = _mm_set1_epi32(0x80000000u32 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self) } } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { let sign_bit = _mm_set1_epi32(0x80000000u32 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self) } } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn
unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); _mm_unpackhi_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) } } #[inline(always)] fn splat_mask32x4(self, val: i32) -> mask32x4 { unsafe { _mm_set1_epi32(val).simd_into(self) } } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { a ^ !0 } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask32x4( self, a: mask32x4, b: mask32x4, c: mask32x4, ) -> mask32x4 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { unsafe { _mm_set1_pd(val).simd_into(self) } } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) } } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { let mask = _mm_set1_pd(-0.0); _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self) } } 
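// NOTE: the `f64x2` float ops above rely on IEEE-754 sign-bit manipulation:
// `-0.0` has only the sign bit set in each lane, so `andnot(-0.0, a)` is
// `abs`, `xor(a, -0.0)` is `neg`, and `(b & -0.0) | (a & !(-0.0))` splices
// b's sign onto a's magnitude for `copysign`. A scalar sketch of the same
// idea (illustrative only, not part of this trait):
//
//     fn copysign64(a: f64, b: f64) -> f64 {
//         let sign = 1u64 << 63;
//         f64::from_bits((a.to_bits() & !sign) | (b.to_bits() & sign))
//     }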
#[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn madd_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { unsafe { _mm_fmadd_pd(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn msub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_floor_pd(a.into()).simd_into(self) } } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { a - a.trunc() } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_round_pd(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) } } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { unsafe { let mask = _mm_castsi128_pd(a.into()); _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, c.into())).simd_into(self) } } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { let mut result = [0.0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { f32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_mask64x2(self, val: i64) -> mask64x2 { unsafe { _mm_set1_epi64x(val).simd_into(self) } } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { a ^ !0 } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } 
#[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask64x2( self, a: mask64x2, b: mask64x2, c: mask64x2, ) -> mask64x2 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { let mut result = [0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f32x8(self, a: f32) -> f32x8 { let half = self.splat_f32x4(a); self.combine_f32x4(half, half) } #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) } #[inline(always)] fn neg_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) } #[inline(always)] fn sqrt_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) } #[inline(always)] fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) } #[inline(always)] fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) } #[inline(always)] fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) } #[inline(always)] fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) } #[inline(always)] fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) } #[inline(always)] fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) } #[inline(always)] fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) } #[inline(always)] fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) } #[inline(always)] fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) } #[inline(always)] fn 
zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, _) = self.split_f32x8(a); let (b0, _) = self.split_f32x8(b); self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) } #[inline(always)] fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (_, a1) = self.split_f32x8(a); let (_, b1) = self.split_f32x8(b); self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) } #[inline(always)] fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) } #[inline(always)] fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) } #[inline(always)] fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) } #[inline(always)] fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.max_precise_f32x4(a0, b0), self.max_precise_f32x4(a1, b1), ) } #[inline(always)] fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.min_precise_f32x4(a0, b0), self.min_precise_f32x4(a1, b1), ) } #[inline(always)] fn madd_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1)) } #[inline(always)] fn msub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1)) } #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) } #[inline(always)] fn fract_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) } #[inline(always)] fn trunc_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) } #[inline(always)] fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) } #[inline(always)] fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { let mut result = [0.0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn 
reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( self.reinterpret_f64_f32x4(a0), self.reinterpret_f64_f32x4(a1), ) } #[inline(always)] fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4( self.reinterpret_i32_f32x4(a0), self.reinterpret_i32_f32x4(a1), ) } #[inline(always)] fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { let (a0, a1) = self.split_f32x8(a); self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) } #[inline(always)] fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4( self.reinterpret_u32_f32x4(a0), self.reinterpret_u32_f32x4(a1), ) } #[inline(always)] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) } #[inline(always)] fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) } #[inline(always)] fn splat_i8x32(self, a: i8) -> i8x32 { let half = self.splat_i8x16(a); self.combine_i8x16(half, half) } #[inline(always)] fn not_i8x32(self, a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) } #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) } #[inline(always)] fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) } #[inline(always)] fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) } #[inline(always)] fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) } #[inline(always)] fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) } #[inline(always)] fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) } #[inline(always)] fn shr_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b)) } #[inline(always)] fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) } #[inline(always)] fn shl_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shl_i8x16(a0, b), self.shl_i8x16(a1, b)) } #[inline(always)] fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) } #[inline(always)] fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); 
self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) } #[inline(always)] fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) } #[inline(always)] fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) } #[inline(always)] fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) } #[inline(always)] fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, _) = self.split_i8x32(a); let (b0, _) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) } #[inline(always)] fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (_, a1) = self.split_i8x32(a); let (_, b1) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) } #[inline(always)] fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) } #[inline(always)] fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) } #[inline(always)] fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_i8x32(b); let (c0, c1) = self.split_i8x32(c); self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) } #[inline(always)] fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) } #[inline(always)] fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) } #[inline(always)] fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x32(self, a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) } #[inline(always)] fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) } #[inline(always)] fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { let (a0, a1) = self.split_i8x32(a); self.combine_u32x4( self.reinterpret_u32_i8x16(a0), self.reinterpret_u32_i8x16(a1), ) } #[inline(always)] fn splat_u8x32(self, a: u8) -> u8x32 { let half = self.splat_u8x16(a); self.combine_u8x16(half, half) } #[inline(always)] fn not_u8x32(self, a: u8x32) -> u8x32 { let (a0, 
a1) = self.split_u8x32(a); self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) } #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) } #[inline(always)] fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) } #[inline(always)] fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) } #[inline(always)] fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) } #[inline(always)] fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) } #[inline(always)] fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) } #[inline(always)] fn shr_u8x32(self, a: u8x32, b: u32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b)) } #[inline(always)] fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) } #[inline(always)] fn shl_u8x32(self, a: u8x32, b: u32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.shl_u8x16(a0, b), self.shl_u8x16(a1, b)) } #[inline(always)] fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) } #[inline(always)] fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) } #[inline(always)] fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) } #[inline(always)] fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) } #[inline(always)] fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) } #[inline(always)] fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, _) = self.split_u8x32(a); let (b0, _) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) } #[inline(always)] fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (_, a1) = self.split_u8x32(a); let (_, b1) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) } #[inline(always)] fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = 
self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) } #[inline(always)] fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) } #[inline(always)] fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_u8x32(b); let (c0, c1) = self.split_u8x32(c); self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) } #[inline(always)] fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) } #[inline(always)] fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) } #[inline(always)] fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn widen_u8x32(self, a: u8x32) -> u16x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) } #[inline(always)] fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { let (a0, a1) = self.split_u8x32(a); self.combine_u32x4( self.reinterpret_u32_u8x16(a0), self.reinterpret_u32_u8x16(a1), ) } #[inline(always)] fn splat_mask8x32(self, a: i8) -> mask8x32 { let half = self.splat_mask8x16(a); self.combine_mask8x16(half, half) } #[inline(always)] fn not_mask8x32(self, a: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) } #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) } #[inline(always)] fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) } #[inline(always)] fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) } #[inline(always)] fn select_mask8x32( self, a: mask8x32, b: mask8x32, c: mask8x32, ) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); let (c0, c1) = self.split_mask8x32(c); self.combine_mask8x16( self.select_mask8x16(a0, b0, c0), self.select_mask8x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) } #[inline(always)] fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { 
let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x16(self, a: i16) -> i16x16 { let half = self.splat_i16x8(a); self.combine_i16x8(half, half) } #[inline(always)] fn not_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) } #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) } #[inline(always)] fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) } #[inline(always)] fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) } #[inline(always)] fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) } #[inline(always)] fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) } #[inline(always)] fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) } #[inline(always)] fn shr_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b)) } #[inline(always)] fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) } #[inline(always)] fn shl_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shl_i16x8(a0, b), self.shl_i16x8(a1, b)) } #[inline(always)] fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) } #[inline(always)] fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) } #[inline(always)] fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) } #[inline(always)] fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) } #[inline(always)] fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> 
mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) } #[inline(always)] fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, _) = self.split_i16x16(a); let (b0, _) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) } #[inline(always)] fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (_, a1) = self.split_i16x16(a); let (_, b1) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) } #[inline(always)] fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) } #[inline(always)] fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) } #[inline(always)] fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_i16x16(b); let (c0, c1) = self.split_i16x16(c); self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) } #[inline(always)] fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) } #[inline(always)] fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) } #[inline(always)] fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) } #[inline(always)] fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { let (a0, a1) = self.split_i16x16(a); self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) } #[inline(always)] fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { let (a0, a1) = self.split_i16x16(a); self.combine_u32x4( self.reinterpret_u32_i16x8(a0), self.reinterpret_u32_i16x8(a1), ) } #[inline(always)] fn splat_u16x16(self, a: u16) -> u16x16 { let half = self.splat_u16x8(a); self.combine_u16x8(half, half) } #[inline(always)] fn not_u16x16(self, a: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) } #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) } #[inline(always)] fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) } 
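// NOTE: as with the other doubled widths, the `u16x16` ops that follow are
// emulated by splitting into two `u16x8` halves, applying the 128-bit
// implementation to each half, and recombining. `split_*`/`combine_*`
// round-trip through scalar arrays; after inlining, the copies are expected
// (though not guaranteed) to be folded into plain register moves.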
#[inline(always)] fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) } #[inline(always)] fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) } #[inline(always)] fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) } #[inline(always)] fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) } #[inline(always)] fn shr_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b)) } #[inline(always)] fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) } #[inline(always)] fn shl_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shl_u16x8(a0, b), self.shl_u16x8(a1, b)) } #[inline(always)] fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) } #[inline(always)] fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) } #[inline(always)] fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) } #[inline(always)] fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) } #[inline(always)] fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) } #[inline(always)] fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, _) = self.split_u16x16(a); let (b0, _) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) } #[inline(always)] fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (_, a1) = self.split_u16x16(a); let (_, b1) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) } #[inline(always)] fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) } #[inline(always)] fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) } #[inline(always)] fn select_u16x16(self, a: 
mask16x16, b: u16x16, c: u16x16) -> u16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_u16x16(b); let (c0, c1) = self.split_u16x16(c); self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) } #[inline(always)] fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) } #[inline(always)] fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) } #[inline(always)] fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { let (a, b) = self.split_u16x16(a); unsafe { let mask = _mm_set1_epi16(0xFF); let lo_masked = _mm_and_si128(a.into(), mask); let hi_masked = _mm_and_si128(b.into(), mask); let result = _mm_packus_epi16(lo_masked, hi_masked); result.simd_into(self) } } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { let (a0, a1) = self.split_u16x16(a); self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) } #[inline(always)] fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { let (a0, a1) = self.split_u16x16(a); self.combine_u32x4( self.reinterpret_u32_u16x8(a0), self.reinterpret_u32_u16x8(a1), ) } #[inline(always)] fn splat_mask16x16(self, a: i16) -> mask16x16 { let half = self.splat_mask16x8(a); self.combine_mask16x8(half, half) } #[inline(always)] fn not_mask16x16(self, a: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) } #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) } #[inline(always)] fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) } #[inline(always)] fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) } #[inline(always)] fn select_mask16x16( self, a: mask16x16, b: mask16x16, c: mask16x16, ) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); let (c0, c1) = self.split_mask16x16(c); self.combine_mask16x8( self.select_mask16x8(a0, b0, c0), self.select_mask16x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) } #[inline(always)] fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { let 
mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x8(self, a: i32) -> i32x8 { let half = self.splat_i32x4(a); self.combine_i32x4(half, half) } #[inline(always)] fn not_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) } #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) } #[inline(always)] fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) } #[inline(always)] fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) } #[inline(always)] fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) } #[inline(always)] fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) } #[inline(always)] fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) } #[inline(always)] fn shr_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b)) } #[inline(always)] fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) } #[inline(always)] fn shl_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shl_i32x4(a0, b), self.shl_i32x4(a1, b)) } #[inline(always)] fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) } #[inline(always)] fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) } #[inline(always)] fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) } #[inline(always)] fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) } #[inline(always)] fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); 
self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) } #[inline(always)] fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, _) = self.split_i32x8(a); let (b0, _) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) } #[inline(always)] fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (_, a1) = self.split_i32x8(a); let (_, b1) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) } #[inline(always)] fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) } #[inline(always)] fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) } #[inline(always)] fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_i32x8(b); let (c0, c1) = self.split_i32x8(c); self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) } #[inline(always)] fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) } #[inline(always)] fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) } #[inline(always)] fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) } #[inline(always)] fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { let (a0, a1) = self.split_i32x8(a); self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) } #[inline(always)] fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_u32x4( self.reinterpret_u32_i32x4(a0), self.reinterpret_u32_i32x4(a1), ) } #[inline(always)] fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) } #[inline(always)] fn splat_u32x8(self, a: u32) -> u32x8 { let half = self.splat_u32x4(a); self.combine_u32x4(half, half) } #[inline(always)] fn not_u32x8(self, a: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) } #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) } #[inline(always)] fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) } 
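// NOTE: `zip_low_*` at a doubled width interleaves the *low* halves of both
// inputs and `zip_high_*` the *high* halves, e.g.
// `zip_low_u32x8([a0..a7], [b0..b7])` = `[a0, b0, a1, b1, a2, b2, a3, b3]`,
// i.e. the single-register `_mm_unpacklo_epi32` semantics scaled up.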
#[inline(always)] fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) } #[inline(always)] fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) } #[inline(always)] fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) } #[inline(always)] fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) } #[inline(always)] fn shr_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b)) } #[inline(always)] fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) } #[inline(always)] fn shl_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shl_u32x4(a0, b), self.shl_u32x4(a1, b)) } #[inline(always)] fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) } #[inline(always)] fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) } #[inline(always)] fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) } #[inline(always)] fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) } #[inline(always)] fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) } #[inline(always)] fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, _) = self.split_u32x8(a); let (b0, _) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) } #[inline(always)] fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (_, a1) = self.split_u32x8(a); let (_, b1) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) } #[inline(always)] fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) } #[inline(always)] fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) } #[inline(always)] fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = 
self.split_u32x8(b); let (c0, c1) = self.split_u32x8(c); self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) } #[inline(always)] fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) } #[inline(always)] fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) } #[inline(always)] fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { let (a0, a1) = self.split_u32x8(a); self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) } #[inline(always)] fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) } #[inline(always)] fn splat_mask32x8(self, a: i32) -> mask32x8 { let half = self.splat_mask32x4(a); self.combine_mask32x4(half, half) } #[inline(always)] fn not_mask32x8(self, a: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) } #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) } #[inline(always)] fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) } #[inline(always)] fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) } #[inline(always)] fn select_mask32x8( self, a: mask32x8, b: mask32x8, c: mask32x8, ) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); let (c0, c1) = self.split_mask32x8(c); self.combine_mask32x4( self.select_mask32x4(a0, b0, c0), self.select_mask32x4(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) } #[inline(always)] fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f64x4(self, a: f64) -> f64x4 { let half = self.splat_f64x2(a); 
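// A wide splat is just the narrow splat combined with itself.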
self.combine_f64x2(half, half) } #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) } #[inline(always)] fn neg_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) } #[inline(always)] fn sqrt_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) } #[inline(always)] fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) } #[inline(always)] fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) } #[inline(always)] fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) } #[inline(always)] fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) } #[inline(always)] fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) } #[inline(always)] fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) } #[inline(always)] fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) } #[inline(always)] fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) } #[inline(always)] fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) } #[inline(always)] fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, _) = self.split_f64x4(a); let (b0, _) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) } #[inline(always)] fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (_, a1) = self.split_f64x4(a); let (_, b1) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) } #[inline(always)] fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) } #[inline(always)] fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); 
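// The odd-indexed lanes of the concatenation (a, b) are the odd lanes of a
// followed by the odd lanes of b, so each operand is unzipped against its own
// two halves before the results are concatenated.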
self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) } #[inline(always)] fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) } #[inline(always)] fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.max_precise_f64x2(a0, b0), self.max_precise_f64x2(a1, b1), ) } #[inline(always)] fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) } #[inline(always)] fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.min_precise_f64x2(a0, b0), self.min_precise_f64x2(a1, b1), ) } #[inline(always)] fn madd_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1)) } #[inline(always)] fn msub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1)) } #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) } #[inline(always)] fn fract_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) } #[inline(always)] fn trunc_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) } #[inline(always)] fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) } #[inline(always)] fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { let mut result = [0.0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { let mut b0 = [0.0; 2usize]; let mut b1 = [0.0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( self.reinterpret_f32_f64x2(a0), self.reinterpret_f32_f64x2(a1), ) } #[inline(always)] fn splat_mask64x4(self, a: i64) -> mask64x4 { let half = self.splat_mask64x2(a); self.combine_mask64x2(half, half) } #[inline(always)] fn not_mask64x4(self, a: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) } #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) } #[inline(always)] fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 
{ let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) } #[inline(always)] fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) } #[inline(always)] fn select_mask64x4( self, a: mask64x4, b: mask64x4, c: mask64x4, ) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); let (c0, c1) = self.split_mask64x4(c); self.combine_mask64x2( self.select_mask64x2(a0, b0, c0), self.select_mask64x2(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) } #[inline(always)] fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { let mut b0 = [0; 2usize]; let mut b1 = [0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f32x16(self, a: f32) -> f32x16 { let half = self.splat_f32x8(a); self.combine_f32x8(half, half) } #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) } #[inline(always)] fn neg_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) } #[inline(always)] fn sqrt_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) } #[inline(always)] fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) } #[inline(always)] fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) } #[inline(always)] fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) } #[inline(always)] fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) } #[inline(always)] fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) } #[inline(always)] fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); 
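// 16-lane f32 comparisons compare each f32x8 half and concatenate the two
// resulting mask32x8 halves.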
self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) } #[inline(always)] fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) } #[inline(always)] fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) } #[inline(always)] fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) } #[inline(always)] fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, _) = self.split_f32x16(a); let (b0, _) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) } #[inline(always)] fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (_, a1) = self.split_f32x16(a); let (_, b1) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) } #[inline(always)] fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) } #[inline(always)] fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) } #[inline(always)] fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) } #[inline(always)] fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.max_precise_f32x8(a0, b0), self.max_precise_f32x8(a1, b1), ) } #[inline(always)] fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) } #[inline(always)] fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.min_precise_f32x8(a0, b0), self.min_precise_f32x8(a1, b1), ) } #[inline(always)] fn madd_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.madd_f32x8(a0, b0, c0), self.madd_f32x8(a1, b1, c1)) } #[inline(always)] fn msub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1)) } #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) } #[inline(always)] fn fract_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) } #[inline(always)] fn trunc_f32x16(self, a: f32x16) -> f32x16 
{ let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) } #[inline(always)] fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) } #[inline(always)] fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { let mut b0 = [0.0; 8usize]; let mut b1 = [0.0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( self.reinterpret_f64_f32x8(a0), self.reinterpret_f64_f32x8(a1), ) } #[inline(always)] fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8( self.reinterpret_i32_f32x8(a0), self.reinterpret_i32_f32x8(a1), ) } #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { crate::Fallback::new() .load_interleaved_128_f32x16(src) .val .simd_into(self) } #[inline(always)] fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { let fb = crate::Fallback::new(); fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) } #[inline(always)] fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8( self.reinterpret_u32_f32x8(a0), self.reinterpret_u32_f32x8(a1), ) } #[inline(always)] fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) } #[inline(always)] fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) } #[inline(always)] fn splat_i8x64(self, a: i8) -> i8x64 { let half = self.splat_i8x32(a); self.combine_i8x32(half, half) } #[inline(always)] fn not_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) } #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) } #[inline(always)] fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) } #[inline(always)] fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) } #[inline(always)] fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) } #[inline(always)] fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) } #[inline(always)] fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = 
self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) } #[inline(always)] fn shr_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b)) } #[inline(always)] fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) } #[inline(always)] fn shl_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.shl_i8x32(a0, b), self.shl_i8x32(a1, b)) } #[inline(always)] fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) } #[inline(always)] fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) } #[inline(always)] fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) } #[inline(always)] fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) } #[inline(always)] fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) } #[inline(always)] fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, _) = self.split_i8x64(a); let (b0, _) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) } #[inline(always)] fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (_, a1) = self.split_i8x64(a); let (_, b1) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) } #[inline(always)] fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) } #[inline(always)] fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) } #[inline(always)] fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_i8x64(b); let (c0, c1) = self.split_i8x64(c); self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) } #[inline(always)] fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) } #[inline(always)] fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) } #[inline(always)] fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; 
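// Copy each half of the backing array into a fresh buffer and rewrap it via
// simd_into; with `#[inline(always)]`, these copies are expected (though not
// guaranteed) to be elided in optimized builds.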
b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) } #[inline(always)] fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) } #[inline(always)] fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { let (a0, a1) = self.split_i8x64(a); self.combine_u32x8( self.reinterpret_u32_i8x32(a0), self.reinterpret_u32_i8x32(a1), ) } #[inline(always)] fn splat_u8x64(self, a: u8) -> u8x64 { let half = self.splat_u8x32(a); self.combine_u8x32(half, half) } #[inline(always)] fn not_u8x64(self, a: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) } #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) } #[inline(always)] fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) } #[inline(always)] fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) } #[inline(always)] fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) } #[inline(always)] fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) } #[inline(always)] fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) } #[inline(always)] fn shr_u8x64(self, a: u8x64, b: u32) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b)) } #[inline(always)] fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) } #[inline(always)] fn shl_u8x64(self, a: u8x64, b: u32) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shl_u8x32(a0, b), self.shl_u8x32(a1, b)) } #[inline(always)] fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) } #[inline(always)] fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) } #[inline(always)] fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) } #[inline(always)] fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = 
self.split_u8x64(b); self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) } #[inline(always)] fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) } #[inline(always)] fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, _) = self.split_u8x64(a); let (b0, _) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) } #[inline(always)] fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (_, a1) = self.split_u8x64(a); let (_, b1) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) } #[inline(always)] fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) } #[inline(always)] fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) } #[inline(always)] fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_u8x64(b); let (c0, c1) = self.split_u8x64(c); self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) } #[inline(always)] fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) } #[inline(always)] fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) } #[inline(always)] fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { crate::Fallback::new() .load_interleaved_128_u8x64(src) .val .simd_into(self) } #[inline(always)] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { let fb = crate::Fallback::new(); fb.store_interleaved_128_u8x64(a.val.simd_into(fb), dest); } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { let (a0, a1) = self.split_u8x64(a); self.combine_u32x8( self.reinterpret_u32_u8x32(a0), self.reinterpret_u32_u8x32(a1), ) } #[inline(always)] fn splat_mask8x64(self, a: i8) -> mask8x64 { let half = self.splat_mask8x32(a); self.combine_mask8x32(half, half) } #[inline(always)] fn not_mask8x64(self, a: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) } #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) } #[inline(always)] fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) } #[inline(always)] fn xor_mask8x64(self, a: 
mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) } #[inline(always)] fn select_mask8x64( self, a: mask8x64, b: mask8x64, c: mask8x64, ) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); let (c0, c1) = self.split_mask8x64(c); self.combine_mask8x32( self.select_mask8x32(a0, b0, c0), self.select_mask8x32(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) } #[inline(always)] fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x32(self, a: i16) -> i16x32 { let half = self.splat_i16x16(a); self.combine_i16x16(half, half) } #[inline(always)] fn not_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) } #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) } #[inline(always)] fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) } #[inline(always)] fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) } #[inline(always)] fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) } #[inline(always)] fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) } #[inline(always)] fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) } #[inline(always)] fn shr_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b)) } #[inline(always)] fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) } #[inline(always)] fn shl_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shl_i16x16(a0, b), self.shl_i16x16(a1, b)) } #[inline(always)] fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) } #[inline(always)] fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = 
self.split_i16x32(b); self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) } #[inline(always)] fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) } #[inline(always)] fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) } #[inline(always)] fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) } #[inline(always)] fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, _) = self.split_i16x32(a); let (b0, _) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) } #[inline(always)] fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (_, a1) = self.split_i16x32(a); let (_, b1) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) } #[inline(always)] fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) } #[inline(always)] fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16( self.unzip_high_i16x16(a0, a1), self.unzip_high_i16x16(b0, b1), ) } #[inline(always)] fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_i16x32(b); let (c0, c1) = self.split_i16x32(c); self.combine_i16x16( self.select_i16x16(a0, b0, c0), self.select_i16x16(a1, b1, c1), ) } #[inline(always)] fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) } #[inline(always)] fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) } #[inline(always)] fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) } #[inline(always)] fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { let (a0, a1) = self.split_i16x32(a); self.combine_u8x32( self.reinterpret_u8_i16x16(a0), self.reinterpret_u8_i16x16(a1), ) } #[inline(always)] fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { let (a0, a1) = self.split_i16x32(a); self.combine_u32x8( self.reinterpret_u32_i16x16(a0), self.reinterpret_u32_i16x16(a1), ) } #[inline(always)] fn splat_u16x32(self, a: u16) -> u16x32 { let half = self.splat_u16x16(a); self.combine_u16x16(half, half) } #[inline(always)] fn not_u16x32(self, a: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); 
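// Apply the 16-lane NOT to each half and concatenate.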
self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) } #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) } #[inline(always)] fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) } #[inline(always)] fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) } #[inline(always)] fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) } #[inline(always)] fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) } #[inline(always)] fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) } #[inline(always)] fn shr_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, b)) } #[inline(always)] fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) } #[inline(always)] fn shl_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shl_u16x16(a0, b), self.shl_u16x16(a1, b)) } #[inline(always)] fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) } #[inline(always)] fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) } #[inline(always)] fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) } #[inline(always)] fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) } #[inline(always)] fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) } #[inline(always)] fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, _) = self.split_u16x32(a); let (b0, _) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) } #[inline(always)] fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (_, a1) = self.split_u16x32(a); let (_, b1) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a1, b1), 
self.zip_high_u16x16(a1, b1)) } #[inline(always)] fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) } #[inline(always)] fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16( self.unzip_high_u16x16(a0, a1), self.unzip_high_u16x16(b0, b1), ) } #[inline(always)] fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_u16x32(b); let (c0, c1) = self.split_u16x32(c); self.combine_u16x16( self.select_u16x16(a0, b0, c0), self.select_u16x16(a1, b1, c1), ) } #[inline(always)] fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) } #[inline(always)] fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) } #[inline(always)] fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { crate::Fallback::new() .load_interleaved_128_u16x32(src) .val .simd_into(self) } #[inline(always)] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { let fb = crate::Fallback::new(); fb.store_interleaved_128_u16x32(a.val.simd_into(fb), dest); } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) } #[inline(always)] fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { let (a0, a1) = self.split_u16x32(a); self.combine_u8x32( self.reinterpret_u8_u16x16(a0), self.reinterpret_u8_u16x16(a1), ) } #[inline(always)] fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { let (a0, a1) = self.split_u16x32(a); self.combine_u32x8( self.reinterpret_u32_u16x16(a0), self.reinterpret_u32_u16x16(a1), ) } #[inline(always)] fn splat_mask16x32(self, a: i16) -> mask16x32 { let half = self.splat_mask16x16(a); self.combine_mask16x16(half, half) } #[inline(always)] fn not_mask16x32(self, a: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) } #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) } #[inline(always)] fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) } #[inline(always)] fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) } #[inline(always)] fn select_mask16x32( self, a: mask16x32, b: mask16x32, 
c: mask16x32, ) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); let (c0, c1) = self.split_mask16x32(c); self.combine_mask16x16( self.select_mask16x16(a0, b0, c0), self.select_mask16x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16( self.simd_eq_mask16x16(a0, b0), self.simd_eq_mask16x16(a1, b1), ) } #[inline(always)] fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x16(self, a: i32) -> i32x16 { let half = self.splat_i32x8(a); self.combine_i32x8(half, half) } #[inline(always)] fn not_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) } #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) } #[inline(always)] fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) } #[inline(always)] fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) } #[inline(always)] fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) } #[inline(always)] fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) } #[inline(always)] fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) } #[inline(always)] fn shr_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b)) } #[inline(always)] fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) } #[inline(always)] fn shl_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shl_i32x8(a0, b), self.shl_i32x8(a1, b)) } #[inline(always)] fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) } #[inline(always)] fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) } #[inline(always)] fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); 
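// Each i32x8 half goes through the 8-lane comparison path above, which in turn
// bottoms out in the 4-lane compares.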
self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) } #[inline(always)] fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) } #[inline(always)] fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) } #[inline(always)] fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, _) = self.split_i32x16(a); let (b0, _) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) } #[inline(always)] fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (_, a1) = self.split_i32x16(a); let (_, b1) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) } #[inline(always)] fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) } #[inline(always)] fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) } #[inline(always)] fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_i32x16(b); let (c0, c1) = self.split_i32x16(c); self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) } #[inline(always)] fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) } #[inline(always)] fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) } #[inline(always)] fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) } #[inline(always)] fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { let (a0, a1) = self.split_i32x16(a); self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) } #[inline(always)] fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_u32x8( self.reinterpret_u32_i32x8(a0), self.reinterpret_u32_i32x8(a1), ) } #[inline(always)] fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) } #[inline(always)] fn splat_u32x16(self, a: u32) -> u32x16 { let half = self.splat_u32x8(a); self.combine_u32x8(half, half) } #[inline(always)] fn not_u32x16(self, a: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) } #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, 
b1) = self.split_u32x16(b); self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) } #[inline(always)] fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) } #[inline(always)] fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) } #[inline(always)] fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) } #[inline(always)] fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) } #[inline(always)] fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) } #[inline(always)] fn shr_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b)) } #[inline(always)] fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) } #[inline(always)] fn shl_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shl_u32x8(a0, b), self.shl_u32x8(a1, b)) } #[inline(always)] fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) } #[inline(always)] fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) } #[inline(always)] fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) } #[inline(always)] fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) } #[inline(always)] fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) } #[inline(always)] fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, _) = self.split_u32x16(a); let (b0, _) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) } #[inline(always)] fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (_, a1) = self.split_u32x16(a); let (_, b1) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) } #[inline(always)] fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_low_u32x8(a0, a1), 
self.unzip_low_u32x8(b0, b1)) } #[inline(always)] fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) } #[inline(always)] fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_u32x16(b); let (c0, c1) = self.split_u32x16(c); self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) } #[inline(always)] fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) } #[inline(always)] fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) } #[inline(always)] fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) }
    #[inline(always)]
    fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 {
        unsafe {
            // Load the four 128-bit rows of a 4x4 matrix of u32.
            let v0 = _mm_loadu_si128(src.as_ptr().add(0) as *const __m128i);
            let v1 = _mm_loadu_si128(src.as_ptr().add(4) as *const __m128i);
            let v2 = _mm_loadu_si128(src.as_ptr().add(8) as *const __m128i);
            let v3 = _mm_loadu_si128(src.as_ptr().add(12) as *const __m128i);
            // 4x4 transpose via two rounds of unpacking: after the epi32 and
            // epi64 unpacks, out_i holds elements {i, i+4, i+8, i+12} of `src`,
            // de-interleaving groups of four (the SSE analogue of NEON's vld4q).
            let tmp0 = _mm_unpacklo_epi32(v0, v1);
            let tmp1 = _mm_unpackhi_epi32(v0, v1);
            let tmp2 = _mm_unpacklo_epi32(v2, v3);
            let tmp3 = _mm_unpackhi_epi32(v2, v3);
            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
            self.combine_u32x8(
                self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
                self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
            )
        }
    }
    #[inline(always)]
    fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () {
        // No dedicated AVX2 path here: re-interleaving on store is delegated
        // to the scalar fallback implementation.
        let fb = crate::Fallback::new();
        fb.store_interleaved_128_u32x16(a.val.simd_into(fb), dest);
    }
#[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { let (a0, a1) = self.split_u32x16(a); self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) } #[inline(always)] fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) } #[inline(always)] fn splat_mask32x16(self, a: i32) -> mask32x16 { let half = self.splat_mask32x8(a); self.combine_mask32x8(half, half) } #[inline(always)] fn not_mask32x16(self, a: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) } #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) } #[inline(always)] fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) } #[inline(always)] fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) =
self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) } #[inline(always)] fn select_mask32x16( self, a: mask32x16, b: mask32x16, c: mask32x16, ) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); let (c0, c1) = self.split_mask32x16(c); self.combine_mask32x8( self.select_mask32x8(a0, b0, c0), self.select_mask32x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) } #[inline(always)] fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f64x8(self, a: f64) -> f64x8 { let half = self.splat_f64x4(a); self.combine_f64x4(half, half) } #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) } #[inline(always)] fn neg_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) } #[inline(always)] fn sqrt_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) } #[inline(always)] fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) } #[inline(always)] fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) } #[inline(always)] fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) } #[inline(always)] fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) } #[inline(always)] fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) } #[inline(always)] fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) } #[inline(always)] fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) } #[inline(always)] fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) } #[inline(always)] 
fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) } #[inline(always)] fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, _) = self.split_f64x8(a); let (b0, _) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) } #[inline(always)] fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (_, a1) = self.split_f64x8(a); let (_, b1) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) } #[inline(always)] fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) } #[inline(always)] fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) } #[inline(always)] fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) } #[inline(always)] fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.max_precise_f64x4(a0, b0), self.max_precise_f64x4(a1, b1), ) } #[inline(always)] fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) } #[inline(always)] fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.min_precise_f64x4(a0, b0), self.min_precise_f64x4(a1, b1), ) } #[inline(always)] fn madd_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1)) } #[inline(always)] fn msub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1)) } #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) } #[inline(always)] fn fract_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) } #[inline(always)] fn trunc_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) } #[inline(always)] fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) } #[inline(always)] fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } 
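    // Every 512-bit-wide operation at this level follows the doubling pattern
    // seen above: AVX2 registers are 256 bits, so each f64x8 / x16 op splits
    // its operands into two native halves, applies the 256-bit op to each
    // half, and recombines. An illustrative sketch of the shape (`<op>` is a
    // placeholder; this sketch is not itself part of the generated file):
    //
    //     fn <op>_f64x8(self, a: f64x8, b: f64x8) -> f64x8 {
    //         let (a0, a1) = self.split_f64x8(a);
    //         let (b0, b1) = self.split_f64x8(b);
    //         self.combine_f64x4(self.<op>_f64x4(a0, b0), self.<op>_f64x4(a1, b1))
    //     }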
#[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( self.reinterpret_f32_f64x4(a0), self.reinterpret_f32_f64x4(a1), ) } #[inline(always)] fn splat_mask64x8(self, a: i64) -> mask64x8 { let half = self.splat_mask64x4(a); self.combine_mask64x4(half, half) } #[inline(always)] fn not_mask64x8(self, a: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1)) } #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1)) } #[inline(always)] fn or_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1)) } #[inline(always)] fn xor_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1)) } #[inline(always)] fn select_mask64x8( self, a: mask64x8, b: mask64x8, c: mask64x8, ) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); let (c0, c1) = self.split_mask64x8(c); self.combine_mask64x4( self.select_mask64x4(a0, b0, c0), self.select_mask64x4(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1)) } #[inline(always)] fn split_mask64x8(self, a: mask64x8) -> (mask64x4, mask64x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } }
impl<S: Simd> SimdFrom<__m256, S> for f32x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m256, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f32x8<S>> for __m256 {
    #[inline(always)]
    fn from(value: f32x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for i8x32<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i8x32<S>> for __m256i {
    #[inline(always)]
    fn from(value: i8x32<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for u8x32<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u8x32<S>> for __m256i {
    #[inline(always)]
    fn from(value: u8x32<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for mask8x32<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask8x32<S>> for __m256i {
    #[inline(always)]
    fn from(value: mask8x32<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for i16x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i16x16<S>> for __m256i {
    #[inline(always)]
    fn from(value: i16x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
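// The `SimdFrom`/`From` pairs in this section convert between the crate's
// portable vector types and the raw AVX2 register types. Both sides have
// identical size and bit layout, so each `transmute` is expected to compile
// away to at most a register move.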
impl<S: Simd> SimdFrom<__m256i, S> for u16x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u16x16<S>> for __m256i {
    #[inline(always)]
    fn from(value: u16x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for mask16x16<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask16x16<S>> for __m256i {
    #[inline(always)]
    fn from(value: mask16x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for i32x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i32x8<S>> for __m256i {
    #[inline(always)]
    fn from(value: i32x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for u32x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u32x8<S>> for __m256i {
    #[inline(always)]
    fn from(value: u32x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for mask32x8<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask32x8<S>> for __m256i {
    #[inline(always)]
    fn from(value: mask32x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256d, S> for f64x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m256d, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f64x4<S>> for __m256d {
    #[inline(always)]
    fn from(value: f64x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<__m256i, S> for mask64x4<S> {
    #[inline(always)]
    fn simd_from(arch: __m256i, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask64x4<S>> for __m256i {
    #[inline(always)]
    fn from(value: mask64x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
fearless_simd-0.3.0/src/generated/fallback.rs000064400000000000000000007744311046102023000173100ustar 00000000000000
// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

// This file is autogenerated by fearless_simd_gen

use crate::{Level, Simd, SimdInto, seal::Seal};
use crate::{
    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32,
    i32x4, i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32,
    mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8,
    u16x16, u16x32, u32x4, u32x8, u32x16,
};
use core::ops::*;
#[cfg(all(feature = "libm", not(feature = "std")))]
trait FloatExt {
    fn floor(self) -> Self;
    fn fract(self) -> Self;
    fn sqrt(self) -> Self;
    fn trunc(self) -> Self;
}
#[cfg(all(feature = "libm", not(feature = "std")))]
impl FloatExt for f32 {
    #[inline(always)]
    fn floor(self) -> f32 {
        libm::floorf(self)
    }
    #[inline(always)]
    fn sqrt(self) -> f32 {
        libm::sqrtf(self)
    }
    #[inline(always)]
    fn fract(self) -> f32 {
        self - self.trunc()
    }
    #[inline(always)]
    fn trunc(self) -> f32 {
        libm::truncf(self)
    }
}
#[cfg(all(feature = "libm", not(feature = "std")))]
impl FloatExt for f64 {
    #[inline(always)]
    fn floor(self) -> f64 {
        libm::floor(self)
    }
    #[inline(always)]
    fn sqrt(self) -> f64 {
        libm::sqrt(self)
    }
    #[inline(always)]
    fn fract(self) -> f64 {
        self - self.trunc()
    }
    #[inline(always)]
    fn trunc(self) -> f64 {
        libm::trunc(self)
    }
}
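// With `std` enabled, the inherent `f32`/`f64` methods are used directly; the
// `FloatExt` shim above exists only for `libm`-backed no_std builds, routing
// `floor`/`sqrt`/`trunc` to libm and deriving `fract` as `self - self.trunc()`.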
#[doc = r#" The SIMD token for the "fallback" level."#]
#[derive(Clone, Copy, Debug)]
pub struct Fallback {
    pub fallback: crate::core_arch::fallback::Fallback,
}
impl Fallback {
    #[inline]
    pub const fn new() -> Self {
        Fallback {
            fallback: crate::core_arch::fallback::Fallback::new(),
        }
    }
}
impl Seal for Fallback {}
impl Simd for Fallback {
    type f32s = f32x4;
    type u8s = u8x16;
    type i8s = i8x16;
    type u16s = u16x8;
    type i16s = i16x8;
    type u32s = u32x4;
    type i32s = i32x4;
    type mask8s = mask8x16;
    type mask16s = mask16x8;
    type mask32s = mask32x4;
    #[inline(always)]
    fn level(self) -> Level {
        Level::Fallback(self)
    }
    #[inline]
    fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
        f()
    }
#[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4 { [val; 4usize].simd_into(self) } #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { [ f32::abs(a[0usize]), f32::abs(a[1usize]), f32::abs(a[2usize]), f32::abs(a[3usize]), ] .simd_into(self) } #[inline(always)] fn neg_f32x4(self, a: f32x4) -> f32x4 { [ f32::neg(a[0usize]), f32::neg(a[1usize]), f32::neg(a[2usize]), f32::neg(a[3usize]), ] .simd_into(self) } #[inline(always)] fn sqrt_f32x4(self, a: f32x4) -> f32x4 { [ f32::sqrt(a[0usize]), f32::sqrt(a[1usize]), f32::sqrt(a[2usize]), f32::sqrt(a[3usize]), ] .simd_into(self) } #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::add(a[0usize], &b[0usize]), f32::add(a[1usize], &b[1usize]), f32::add(a[2usize], &b[2usize]), f32::add(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::sub(a[0usize], &b[0usize]), f32::sub(a[1usize], &b[1usize]), f32::sub(a[2usize], &b[2usize]), f32::sub(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::mul(a[0usize], &b[0usize]), f32::mul(a[1usize], &b[1usize]), f32::mul(a[2usize], &b[2usize]), f32::mul(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::div(a[0usize], &b[0usize]), f32::div(a[1usize], &b[1usize]), f32::div(a[2usize], &b[2usize]), f32::div(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::copysign(a[0usize], b[0usize]), f32::copysign(a[1usize], b[1usize]), f32::copysign(a[2usize], b[2usize]), f32::copysign(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { [ -(f32::eq(&a[0usize], &b[0usize]) as i32), -(f32::eq(&a[1usize], &b[1usize]) as i32), -(f32::eq(&a[2usize], &b[2usize]) as i32), -(f32::eq(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { [ -(f32::lt(&a[0usize], &b[0usize]) as i32), -(f32::lt(&a[1usize], &b[1usize]) as i32), -(f32::lt(&a[2usize], &b[2usize]) as i32), -(f32::lt(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { [ -(f32::le(&a[0usize], &b[0usize]) as i32), -(f32::le(&a[1usize], &b[1usize]) as i32), -(f32::le(&a[2usize], &b[2usize]) as i32), -(f32::le(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { [ -(f32::ge(&a[0usize], &b[0usize]) as i32), -(f32::ge(&a[1usize], &b[1usize]) as i32), -(f32::ge(&a[2usize], &b[2usize]) as i32), -(f32::ge(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { [ -(f32::gt(&a[0usize], &b[0usize]) as i32), -(f32::gt(&a[1usize], &b[1usize]) as i32), -(f32::gt(&a[2usize], &b[2usize]) as
i32), -(f32::gt(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [a[0usize], b[0usize], a[1usize], b[1usize]].simd_into(self) } #[inline(always)] fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [a[2usize], b[2usize], a[3usize], b[3usize]].simd_into(self) } #[inline(always)] fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [a[0usize], a[2usize], b[0usize], b[2usize]].simd_into(self) } #[inline(always)] fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [a[1usize], a[3usize], b[1usize], b[3usize]].simd_into(self) } #[inline(always)] fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::max(a[0usize], b[0usize]), f32::max(a[1usize], b[1usize]), f32::max(a[2usize], b[2usize]), f32::max(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::max(a[0usize], b[0usize]), f32::max(a[1usize], b[1usize]), f32::max(a[2usize], b[2usize]), f32::max(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::min(a[0usize], b[0usize]), f32::min(a[1usize], b[1usize]), f32::min(a[2usize], b[2usize]), f32::min(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { [ f32::min(a[0usize], b[0usize]), f32::min(a[1usize], b[1usize]), f32::min(a[2usize], b[2usize]), f32::min(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn madd_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { a.mul(b).add(c) } #[inline(always)] fn msub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { a.mul(b).sub(c) } #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { [ f32::floor(a[0usize]), f32::floor(a[1usize]), f32::floor(a[2usize]), f32::floor(a[3usize]), ] .simd_into(self) } #[inline(always)] fn fract_f32x4(self, a: f32x4) -> f32x4 { [ f32::fract(a[0usize]), f32::fract(a[1usize]), f32::fract(a[2usize]), f32::fract(a[3usize]), ] .simd_into(self) } #[inline(always)] fn trunc_f32x4(self, a: f32x4) -> f32x4 { [ f32::trunc(a[0usize]), f32::trunc(a[1usize]), f32::trunc(a[2usize]), f32::trunc(a[3usize]), ] .simd_into(self) } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, ] .simd_into(self) } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { let mut result = [0.0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { f64x2 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { i32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { [ a[0usize] as u32, a[1usize] as u32, a[2usize] as u32, a[3usize] as u32, ] .simd_into(self) } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { [ a[0usize] as i32, a[1usize] as i32, a[2usize] as i32, a[3usize] as i32, ] 
.simd_into(self) } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { [val; 16usize].simd_into(self) } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { [ i8::not(a[0usize]), i8::not(a[1usize]), i8::not(a[2usize]), i8::not(a[3usize]), i8::not(a[4usize]), i8::not(a[5usize]), i8::not(a[6usize]), i8::not(a[7usize]), i8::not(a[8usize]), i8::not(a[9usize]), i8::not(a[10usize]), i8::not(a[11usize]), i8::not(a[12usize]), i8::not(a[13usize]), i8::not(a[14usize]), i8::not(a[15usize]), ] .simd_into(self) } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::wrapping_add(a[0usize], b[0usize]), i8::wrapping_add(a[1usize], b[1usize]), i8::wrapping_add(a[2usize], b[2usize]), i8::wrapping_add(a[3usize], b[3usize]), i8::wrapping_add(a[4usize], b[4usize]), i8::wrapping_add(a[5usize], b[5usize]), i8::wrapping_add(a[6usize], b[6usize]), i8::wrapping_add(a[7usize], b[7usize]), i8::wrapping_add(a[8usize], b[8usize]), i8::wrapping_add(a[9usize], b[9usize]), i8::wrapping_add(a[10usize], b[10usize]), i8::wrapping_add(a[11usize], b[11usize]), i8::wrapping_add(a[12usize], b[12usize]), i8::wrapping_add(a[13usize], b[13usize]), i8::wrapping_add(a[14usize], b[14usize]), i8::wrapping_add(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::wrapping_sub(a[0usize], b[0usize]), i8::wrapping_sub(a[1usize], b[1usize]), i8::wrapping_sub(a[2usize], b[2usize]), i8::wrapping_sub(a[3usize], b[3usize]), i8::wrapping_sub(a[4usize], b[4usize]), i8::wrapping_sub(a[5usize], b[5usize]), i8::wrapping_sub(a[6usize], b[6usize]), i8::wrapping_sub(a[7usize], b[7usize]), i8::wrapping_sub(a[8usize], b[8usize]), i8::wrapping_sub(a[9usize], b[9usize]), i8::wrapping_sub(a[10usize], b[10usize]), i8::wrapping_sub(a[11usize], b[11usize]), i8::wrapping_sub(a[12usize], b[12usize]), i8::wrapping_sub(a[13usize], b[13usize]), i8::wrapping_sub(a[14usize], b[14usize]), i8::wrapping_sub(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::wrapping_mul(a[0usize], b[0usize]), i8::wrapping_mul(a[1usize], b[1usize]), i8::wrapping_mul(a[2usize], b[2usize]), i8::wrapping_mul(a[3usize], b[3usize]), i8::wrapping_mul(a[4usize], b[4usize]), i8::wrapping_mul(a[5usize], b[5usize]), i8::wrapping_mul(a[6usize], b[6usize]), i8::wrapping_mul(a[7usize], b[7usize]), i8::wrapping_mul(a[8usize], b[8usize]), i8::wrapping_mul(a[9usize], b[9usize]), i8::wrapping_mul(a[10usize], b[10usize]), i8::wrapping_mul(a[11usize], b[11usize]), i8::wrapping_mul(a[12usize], b[12usize]), i8::wrapping_mul(a[13usize], b[13usize]), i8::wrapping_mul(a[14usize], b[14usize]), i8::wrapping_mul(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::bitand(a[0usize], &b[0usize]), i8::bitand(a[1usize], &b[1usize]), i8::bitand(a[2usize], &b[2usize]), i8::bitand(a[3usize], &b[3usize]), i8::bitand(a[4usize], &b[4usize]), i8::bitand(a[5usize], &b[5usize]), i8::bitand(a[6usize], &b[6usize]), i8::bitand(a[7usize], &b[7usize]), i8::bitand(a[8usize], &b[8usize]), i8::bitand(a[9usize], &b[9usize]), i8::bitand(a[10usize], &b[10usize]), i8::bitand(a[11usize], &b[11usize]), i8::bitand(a[12usize], &b[12usize]), i8::bitand(a[13usize], &b[13usize]), i8::bitand(a[14usize], &b[14usize]), i8::bitand(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::bitor(a[0usize], &b[0usize]), i8::bitor(a[1usize], &b[1usize]), 
i8::bitor(a[2usize], &b[2usize]), i8::bitor(a[3usize], &b[3usize]), i8::bitor(a[4usize], &b[4usize]), i8::bitor(a[5usize], &b[5usize]), i8::bitor(a[6usize], &b[6usize]), i8::bitor(a[7usize], &b[7usize]), i8::bitor(a[8usize], &b[8usize]), i8::bitor(a[9usize], &b[9usize]), i8::bitor(a[10usize], &b[10usize]), i8::bitor(a[11usize], &b[11usize]), i8::bitor(a[12usize], &b[12usize]), i8::bitor(a[13usize], &b[13usize]), i8::bitor(a[14usize], &b[14usize]), i8::bitor(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::bitxor(a[0usize], &b[0usize]), i8::bitxor(a[1usize], &b[1usize]), i8::bitxor(a[2usize], &b[2usize]), i8::bitxor(a[3usize], &b[3usize]), i8::bitxor(a[4usize], &b[4usize]), i8::bitxor(a[5usize], &b[5usize]), i8::bitxor(a[6usize], &b[6usize]), i8::bitxor(a[7usize], &b[7usize]), i8::bitxor(a[8usize], &b[8usize]), i8::bitxor(a[9usize], &b[9usize]), i8::bitxor(a[10usize], &b[10usize]), i8::bitxor(a[11usize], &b[11usize]), i8::bitxor(a[12usize], &b[12usize]), i8::bitxor(a[13usize], &b[13usize]), i8::bitxor(a[14usize], &b[14usize]), i8::bitxor(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { [ i8::shr(a[0usize], shift as i8), i8::shr(a[1usize], shift as i8), i8::shr(a[2usize], shift as i8), i8::shr(a[3usize], shift as i8), i8::shr(a[4usize], shift as i8), i8::shr(a[5usize], shift as i8), i8::shr(a[6usize], shift as i8), i8::shr(a[7usize], shift as i8), i8::shr(a[8usize], shift as i8), i8::shr(a[9usize], shift as i8), i8::shr(a[10usize], shift as i8), i8::shr(a[11usize], shift as i8), i8::shr(a[12usize], shift as i8), i8::shr(a[13usize], shift as i8), i8::shr(a[14usize], shift as i8), i8::shr(a[15usize], shift as i8), ] .simd_into(self) } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::shr(a[0usize], &b[0usize]), i8::shr(a[1usize], &b[1usize]), i8::shr(a[2usize], &b[2usize]), i8::shr(a[3usize], &b[3usize]), i8::shr(a[4usize], &b[4usize]), i8::shr(a[5usize], &b[5usize]), i8::shr(a[6usize], &b[6usize]), i8::shr(a[7usize], &b[7usize]), i8::shr(a[8usize], &b[8usize]), i8::shr(a[9usize], &b[9usize]), i8::shr(a[10usize], &b[10usize]), i8::shr(a[11usize], &b[11usize]), i8::shr(a[12usize], &b[12usize]), i8::shr(a[13usize], &b[13usize]), i8::shr(a[14usize], &b[14usize]), i8::shr(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { [ i8::shl(a[0usize], shift as i8), i8::shl(a[1usize], shift as i8), i8::shl(a[2usize], shift as i8), i8::shl(a[3usize], shift as i8), i8::shl(a[4usize], shift as i8), i8::shl(a[5usize], shift as i8), i8::shl(a[6usize], shift as i8), i8::shl(a[7usize], shift as i8), i8::shl(a[8usize], shift as i8), i8::shl(a[9usize], shift as i8), i8::shl(a[10usize], shift as i8), i8::shl(a[11usize], shift as i8), i8::shl(a[12usize], shift as i8), i8::shl(a[13usize], shift as i8), i8::shl(a[14usize], shift as i8), i8::shl(a[15usize], shift as i8), ] .simd_into(self) } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { [ -(i8::eq(&a[0usize], &b[0usize]) as i8), -(i8::eq(&a[1usize], &b[1usize]) as i8), -(i8::eq(&a[2usize], &b[2usize]) as i8), -(i8::eq(&a[3usize], &b[3usize]) as i8), -(i8::eq(&a[4usize], &b[4usize]) as i8), -(i8::eq(&a[5usize], &b[5usize]) as i8), -(i8::eq(&a[6usize], &b[6usize]) as i8), -(i8::eq(&a[7usize], &b[7usize]) as i8), -(i8::eq(&a[8usize], &b[8usize]) as i8), -(i8::eq(&a[9usize], &b[9usize]) as i8), -(i8::eq(&a[10usize], 
&b[10usize]) as i8), -(i8::eq(&a[11usize], &b[11usize]) as i8), -(i8::eq(&a[12usize], &b[12usize]) as i8), -(i8::eq(&a[13usize], &b[13usize]) as i8), -(i8::eq(&a[14usize], &b[14usize]) as i8), -(i8::eq(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { [ -(i8::lt(&a[0usize], &b[0usize]) as i8), -(i8::lt(&a[1usize], &b[1usize]) as i8), -(i8::lt(&a[2usize], &b[2usize]) as i8), -(i8::lt(&a[3usize], &b[3usize]) as i8), -(i8::lt(&a[4usize], &b[4usize]) as i8), -(i8::lt(&a[5usize], &b[5usize]) as i8), -(i8::lt(&a[6usize], &b[6usize]) as i8), -(i8::lt(&a[7usize], &b[7usize]) as i8), -(i8::lt(&a[8usize], &b[8usize]) as i8), -(i8::lt(&a[9usize], &b[9usize]) as i8), -(i8::lt(&a[10usize], &b[10usize]) as i8), -(i8::lt(&a[11usize], &b[11usize]) as i8), -(i8::lt(&a[12usize], &b[12usize]) as i8), -(i8::lt(&a[13usize], &b[13usize]) as i8), -(i8::lt(&a[14usize], &b[14usize]) as i8), -(i8::lt(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { [ -(i8::le(&a[0usize], &b[0usize]) as i8), -(i8::le(&a[1usize], &b[1usize]) as i8), -(i8::le(&a[2usize], &b[2usize]) as i8), -(i8::le(&a[3usize], &b[3usize]) as i8), -(i8::le(&a[4usize], &b[4usize]) as i8), -(i8::le(&a[5usize], &b[5usize]) as i8), -(i8::le(&a[6usize], &b[6usize]) as i8), -(i8::le(&a[7usize], &b[7usize]) as i8), -(i8::le(&a[8usize], &b[8usize]) as i8), -(i8::le(&a[9usize], &b[9usize]) as i8), -(i8::le(&a[10usize], &b[10usize]) as i8), -(i8::le(&a[11usize], &b[11usize]) as i8), -(i8::le(&a[12usize], &b[12usize]) as i8), -(i8::le(&a[13usize], &b[13usize]) as i8), -(i8::le(&a[14usize], &b[14usize]) as i8), -(i8::le(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { [ -(i8::ge(&a[0usize], &b[0usize]) as i8), -(i8::ge(&a[1usize], &b[1usize]) as i8), -(i8::ge(&a[2usize], &b[2usize]) as i8), -(i8::ge(&a[3usize], &b[3usize]) as i8), -(i8::ge(&a[4usize], &b[4usize]) as i8), -(i8::ge(&a[5usize], &b[5usize]) as i8), -(i8::ge(&a[6usize], &b[6usize]) as i8), -(i8::ge(&a[7usize], &b[7usize]) as i8), -(i8::ge(&a[8usize], &b[8usize]) as i8), -(i8::ge(&a[9usize], &b[9usize]) as i8), -(i8::ge(&a[10usize], &b[10usize]) as i8), -(i8::ge(&a[11usize], &b[11usize]) as i8), -(i8::ge(&a[12usize], &b[12usize]) as i8), -(i8::ge(&a[13usize], &b[13usize]) as i8), -(i8::ge(&a[14usize], &b[14usize]) as i8), -(i8::ge(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { [ -(i8::gt(&a[0usize], &b[0usize]) as i8), -(i8::gt(&a[1usize], &b[1usize]) as i8), -(i8::gt(&a[2usize], &b[2usize]) as i8), -(i8::gt(&a[3usize], &b[3usize]) as i8), -(i8::gt(&a[4usize], &b[4usize]) as i8), -(i8::gt(&a[5usize], &b[5usize]) as i8), -(i8::gt(&a[6usize], &b[6usize]) as i8), -(i8::gt(&a[7usize], &b[7usize]) as i8), -(i8::gt(&a[8usize], &b[8usize]) as i8), -(i8::gt(&a[9usize], &b[9usize]) as i8), -(i8::gt(&a[10usize], &b[10usize]) as i8), -(i8::gt(&a[11usize], &b[11usize]) as i8), -(i8::gt(&a[12usize], &b[12usize]) as i8), -(i8::gt(&a[13usize], &b[13usize]) as i8), -(i8::gt(&a[14usize], &b[14usize]) as i8), -(i8::gt(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ a[0usize], b[0usize], a[1usize], b[1usize], a[2usize], b[2usize], a[3usize], b[3usize], a[4usize], b[4usize], a[5usize], b[5usize], a[6usize], b[6usize], 
a[7usize], b[7usize], ] .simd_into(self) } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ a[8usize], b[8usize], a[9usize], b[9usize], a[10usize], b[10usize], a[11usize], b[11usize], a[12usize], b[12usize], a[13usize], b[13usize], a[14usize], b[14usize], a[15usize], b[15usize], ] .simd_into(self) } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ a[0usize], a[2usize], a[4usize], a[6usize], a[8usize], a[10usize], a[12usize], a[14usize], b[0usize], b[2usize], b[4usize], b[6usize], b[8usize], b[10usize], b[12usize], b[14usize], ] .simd_into(self) } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ a[1usize], a[3usize], a[5usize], a[7usize], a[9usize], a[11usize], a[13usize], a[15usize], b[1usize], b[3usize], b[5usize], b[7usize], b[9usize], b[11usize], b[13usize], b[15usize], ] .simd_into(self) } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, if a[4usize] != 0 { b[4usize] } else { c[4usize] }, if a[5usize] != 0 { b[5usize] } else { c[5usize] }, if a[6usize] != 0 { b[6usize] } else { c[6usize] }, if a[7usize] != 0 { b[7usize] } else { c[7usize] }, if a[8usize] != 0 { b[8usize] } else { c[8usize] }, if a[9usize] != 0 { b[9usize] } else { c[9usize] }, if a[10usize] != 0 { b[10usize] } else { c[10usize] }, if a[11usize] != 0 { b[11usize] } else { c[11usize] }, if a[12usize] != 0 { b[12usize] } else { c[12usize] }, if a[13usize] != 0 { b[13usize] } else { c[13usize] }, if a[14usize] != 0 { b[14usize] } else { c[14usize] }, if a[15usize] != 0 { b[15usize] } else { c[15usize] }, ] .simd_into(self) } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::min(a[0usize], b[0usize]), i8::min(a[1usize], b[1usize]), i8::min(a[2usize], b[2usize]), i8::min(a[3usize], b[3usize]), i8::min(a[4usize], b[4usize]), i8::min(a[5usize], b[5usize]), i8::min(a[6usize], b[6usize]), i8::min(a[7usize], b[7usize]), i8::min(a[8usize], b[8usize]), i8::min(a[9usize], b[9usize]), i8::min(a[10usize], b[10usize]), i8::min(a[11usize], b[11usize]), i8::min(a[12usize], b[12usize]), i8::min(a[13usize], b[13usize]), i8::min(a[14usize], b[14usize]), i8::min(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::max(a[0usize], b[0usize]), i8::max(a[1usize], b[1usize]), i8::max(a[2usize], b[2usize]), i8::max(a[3usize], b[3usize]), i8::max(a[4usize], b[4usize]), i8::max(a[5usize], b[5usize]), i8::max(a[6usize], b[6usize]), i8::max(a[7usize], b[7usize]), i8::max(a[8usize], b[8usize]), i8::max(a[9usize], b[9usize]), i8::max(a[10usize], b[10usize]), i8::max(a[11usize], b[11usize]), i8::max(a[12usize], b[12usize]), i8::max(a[13usize], b[13usize]), i8::max(a[14usize], b[14usize]), i8::max(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { [ i8::neg(a[0usize]), i8::neg(a[1usize]), i8::neg(a[2usize]), i8::neg(a[3usize]), i8::neg(a[4usize]), i8::neg(a[5usize]), i8::neg(a[6usize]), i8::neg(a[7usize]), i8::neg(a[8usize]), i8::neg(a[9usize]), i8::neg(a[10usize]), 
i8::neg(a[11usize]), i8::neg(a[12usize]), i8::neg(a[13usize]), i8::neg(a[14usize]), i8::neg(a[15usize]), ] .simd_into(self) } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { [val; 16usize].simd_into(self) } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { [ u8::not(a[0usize]), u8::not(a[1usize]), u8::not(a[2usize]), u8::not(a[3usize]), u8::not(a[4usize]), u8::not(a[5usize]), u8::not(a[6usize]), u8::not(a[7usize]), u8::not(a[8usize]), u8::not(a[9usize]), u8::not(a[10usize]), u8::not(a[11usize]), u8::not(a[12usize]), u8::not(a[13usize]), u8::not(a[14usize]), u8::not(a[15usize]), ] .simd_into(self) } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::wrapping_add(a[0usize], b[0usize]), u8::wrapping_add(a[1usize], b[1usize]), u8::wrapping_add(a[2usize], b[2usize]), u8::wrapping_add(a[3usize], b[3usize]), u8::wrapping_add(a[4usize], b[4usize]), u8::wrapping_add(a[5usize], b[5usize]), u8::wrapping_add(a[6usize], b[6usize]), u8::wrapping_add(a[7usize], b[7usize]), u8::wrapping_add(a[8usize], b[8usize]), u8::wrapping_add(a[9usize], b[9usize]), u8::wrapping_add(a[10usize], b[10usize]), u8::wrapping_add(a[11usize], b[11usize]), u8::wrapping_add(a[12usize], b[12usize]), u8::wrapping_add(a[13usize], b[13usize]), u8::wrapping_add(a[14usize], b[14usize]), u8::wrapping_add(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::wrapping_sub(a[0usize], b[0usize]), u8::wrapping_sub(a[1usize], b[1usize]), u8::wrapping_sub(a[2usize], b[2usize]), u8::wrapping_sub(a[3usize], b[3usize]), u8::wrapping_sub(a[4usize], b[4usize]), u8::wrapping_sub(a[5usize], b[5usize]), u8::wrapping_sub(a[6usize], b[6usize]), u8::wrapping_sub(a[7usize], b[7usize]), u8::wrapping_sub(a[8usize], b[8usize]), u8::wrapping_sub(a[9usize], b[9usize]), u8::wrapping_sub(a[10usize], b[10usize]), u8::wrapping_sub(a[11usize], b[11usize]), u8::wrapping_sub(a[12usize], b[12usize]), u8::wrapping_sub(a[13usize], b[13usize]), u8::wrapping_sub(a[14usize], b[14usize]), u8::wrapping_sub(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::wrapping_mul(a[0usize], b[0usize]), u8::wrapping_mul(a[1usize], b[1usize]), u8::wrapping_mul(a[2usize], b[2usize]), u8::wrapping_mul(a[3usize], b[3usize]), u8::wrapping_mul(a[4usize], b[4usize]), u8::wrapping_mul(a[5usize], b[5usize]), u8::wrapping_mul(a[6usize], b[6usize]), u8::wrapping_mul(a[7usize], b[7usize]), u8::wrapping_mul(a[8usize], b[8usize]), u8::wrapping_mul(a[9usize], b[9usize]), u8::wrapping_mul(a[10usize], b[10usize]), u8::wrapping_mul(a[11usize], b[11usize]), u8::wrapping_mul(a[12usize], b[12usize]), u8::wrapping_mul(a[13usize], b[13usize]), u8::wrapping_mul(a[14usize], b[14usize]), u8::wrapping_mul(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::bitand(a[0usize], &b[0usize]), u8::bitand(a[1usize], &b[1usize]), u8::bitand(a[2usize], &b[2usize]), u8::bitand(a[3usize], &b[3usize]), u8::bitand(a[4usize], &b[4usize]), u8::bitand(a[5usize], &b[5usize]), u8::bitand(a[6usize], &b[6usize]), u8::bitand(a[7usize], &b[7usize]), u8::bitand(a[8usize], &b[8usize]), u8::bitand(a[9usize], &b[9usize]), u8::bitand(a[10usize], 
&b[10usize]), u8::bitand(a[11usize], &b[11usize]), u8::bitand(a[12usize], &b[12usize]), u8::bitand(a[13usize], &b[13usize]), u8::bitand(a[14usize], &b[14usize]), u8::bitand(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::bitor(a[0usize], &b[0usize]), u8::bitor(a[1usize], &b[1usize]), u8::bitor(a[2usize], &b[2usize]), u8::bitor(a[3usize], &b[3usize]), u8::bitor(a[4usize], &b[4usize]), u8::bitor(a[5usize], &b[5usize]), u8::bitor(a[6usize], &b[6usize]), u8::bitor(a[7usize], &b[7usize]), u8::bitor(a[8usize], &b[8usize]), u8::bitor(a[9usize], &b[9usize]), u8::bitor(a[10usize], &b[10usize]), u8::bitor(a[11usize], &b[11usize]), u8::bitor(a[12usize], &b[12usize]), u8::bitor(a[13usize], &b[13usize]), u8::bitor(a[14usize], &b[14usize]), u8::bitor(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::bitxor(a[0usize], &b[0usize]), u8::bitxor(a[1usize], &b[1usize]), u8::bitxor(a[2usize], &b[2usize]), u8::bitxor(a[3usize], &b[3usize]), u8::bitxor(a[4usize], &b[4usize]), u8::bitxor(a[5usize], &b[5usize]), u8::bitxor(a[6usize], &b[6usize]), u8::bitxor(a[7usize], &b[7usize]), u8::bitxor(a[8usize], &b[8usize]), u8::bitxor(a[9usize], &b[9usize]), u8::bitxor(a[10usize], &b[10usize]), u8::bitxor(a[11usize], &b[11usize]), u8::bitxor(a[12usize], &b[12usize]), u8::bitxor(a[13usize], &b[13usize]), u8::bitxor(a[14usize], &b[14usize]), u8::bitxor(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { [ u8::shr(a[0usize], shift as u8), u8::shr(a[1usize], shift as u8), u8::shr(a[2usize], shift as u8), u8::shr(a[3usize], shift as u8), u8::shr(a[4usize], shift as u8), u8::shr(a[5usize], shift as u8), u8::shr(a[6usize], shift as u8), u8::shr(a[7usize], shift as u8), u8::shr(a[8usize], shift as u8), u8::shr(a[9usize], shift as u8), u8::shr(a[10usize], shift as u8), u8::shr(a[11usize], shift as u8), u8::shr(a[12usize], shift as u8), u8::shr(a[13usize], shift as u8), u8::shr(a[14usize], shift as u8), u8::shr(a[15usize], shift as u8), ] .simd_into(self) } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::shr(a[0usize], &b[0usize]), u8::shr(a[1usize], &b[1usize]), u8::shr(a[2usize], &b[2usize]), u8::shr(a[3usize], &b[3usize]), u8::shr(a[4usize], &b[4usize]), u8::shr(a[5usize], &b[5usize]), u8::shr(a[6usize], &b[6usize]), u8::shr(a[7usize], &b[7usize]), u8::shr(a[8usize], &b[8usize]), u8::shr(a[9usize], &b[9usize]), u8::shr(a[10usize], &b[10usize]), u8::shr(a[11usize], &b[11usize]), u8::shr(a[12usize], &b[12usize]), u8::shr(a[13usize], &b[13usize]), u8::shr(a[14usize], &b[14usize]), u8::shr(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { [ u8::shl(a[0usize], shift as u8), u8::shl(a[1usize], shift as u8), u8::shl(a[2usize], shift as u8), u8::shl(a[3usize], shift as u8), u8::shl(a[4usize], shift as u8), u8::shl(a[5usize], shift as u8), u8::shl(a[6usize], shift as u8), u8::shl(a[7usize], shift as u8), u8::shl(a[8usize], shift as u8), u8::shl(a[9usize], shift as u8), u8::shl(a[10usize], shift as u8), u8::shl(a[11usize], shift as u8), u8::shl(a[12usize], shift as u8), u8::shl(a[13usize], shift as u8), u8::shl(a[14usize], shift as u8), u8::shl(a[15usize], shift as u8), ] .simd_into(self) } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { [ -(u8::eq(&a[0usize], &b[0usize]) as i8), -(u8::eq(&a[1usize], &b[1usize]) as 
i8), -(u8::eq(&a[2usize], &b[2usize]) as i8), -(u8::eq(&a[3usize], &b[3usize]) as i8), -(u8::eq(&a[4usize], &b[4usize]) as i8), -(u8::eq(&a[5usize], &b[5usize]) as i8), -(u8::eq(&a[6usize], &b[6usize]) as i8), -(u8::eq(&a[7usize], &b[7usize]) as i8), -(u8::eq(&a[8usize], &b[8usize]) as i8), -(u8::eq(&a[9usize], &b[9usize]) as i8), -(u8::eq(&a[10usize], &b[10usize]) as i8), -(u8::eq(&a[11usize], &b[11usize]) as i8), -(u8::eq(&a[12usize], &b[12usize]) as i8), -(u8::eq(&a[13usize], &b[13usize]) as i8), -(u8::eq(&a[14usize], &b[14usize]) as i8), -(u8::eq(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { [ -(u8::lt(&a[0usize], &b[0usize]) as i8), -(u8::lt(&a[1usize], &b[1usize]) as i8), -(u8::lt(&a[2usize], &b[2usize]) as i8), -(u8::lt(&a[3usize], &b[3usize]) as i8), -(u8::lt(&a[4usize], &b[4usize]) as i8), -(u8::lt(&a[5usize], &b[5usize]) as i8), -(u8::lt(&a[6usize], &b[6usize]) as i8), -(u8::lt(&a[7usize], &b[7usize]) as i8), -(u8::lt(&a[8usize], &b[8usize]) as i8), -(u8::lt(&a[9usize], &b[9usize]) as i8), -(u8::lt(&a[10usize], &b[10usize]) as i8), -(u8::lt(&a[11usize], &b[11usize]) as i8), -(u8::lt(&a[12usize], &b[12usize]) as i8), -(u8::lt(&a[13usize], &b[13usize]) as i8), -(u8::lt(&a[14usize], &b[14usize]) as i8), -(u8::lt(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { [ -(u8::le(&a[0usize], &b[0usize]) as i8), -(u8::le(&a[1usize], &b[1usize]) as i8), -(u8::le(&a[2usize], &b[2usize]) as i8), -(u8::le(&a[3usize], &b[3usize]) as i8), -(u8::le(&a[4usize], &b[4usize]) as i8), -(u8::le(&a[5usize], &b[5usize]) as i8), -(u8::le(&a[6usize], &b[6usize]) as i8), -(u8::le(&a[7usize], &b[7usize]) as i8), -(u8::le(&a[8usize], &b[8usize]) as i8), -(u8::le(&a[9usize], &b[9usize]) as i8), -(u8::le(&a[10usize], &b[10usize]) as i8), -(u8::le(&a[11usize], &b[11usize]) as i8), -(u8::le(&a[12usize], &b[12usize]) as i8), -(u8::le(&a[13usize], &b[13usize]) as i8), -(u8::le(&a[14usize], &b[14usize]) as i8), -(u8::le(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { [ -(u8::ge(&a[0usize], &b[0usize]) as i8), -(u8::ge(&a[1usize], &b[1usize]) as i8), -(u8::ge(&a[2usize], &b[2usize]) as i8), -(u8::ge(&a[3usize], &b[3usize]) as i8), -(u8::ge(&a[4usize], &b[4usize]) as i8), -(u8::ge(&a[5usize], &b[5usize]) as i8), -(u8::ge(&a[6usize], &b[6usize]) as i8), -(u8::ge(&a[7usize], &b[7usize]) as i8), -(u8::ge(&a[8usize], &b[8usize]) as i8), -(u8::ge(&a[9usize], &b[9usize]) as i8), -(u8::ge(&a[10usize], &b[10usize]) as i8), -(u8::ge(&a[11usize], &b[11usize]) as i8), -(u8::ge(&a[12usize], &b[12usize]) as i8), -(u8::ge(&a[13usize], &b[13usize]) as i8), -(u8::ge(&a[14usize], &b[14usize]) as i8), -(u8::ge(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { [ -(u8::gt(&a[0usize], &b[0usize]) as i8), -(u8::gt(&a[1usize], &b[1usize]) as i8), -(u8::gt(&a[2usize], &b[2usize]) as i8), -(u8::gt(&a[3usize], &b[3usize]) as i8), -(u8::gt(&a[4usize], &b[4usize]) as i8), -(u8::gt(&a[5usize], &b[5usize]) as i8), -(u8::gt(&a[6usize], &b[6usize]) as i8), -(u8::gt(&a[7usize], &b[7usize]) as i8), -(u8::gt(&a[8usize], &b[8usize]) as i8), -(u8::gt(&a[9usize], &b[9usize]) as i8), -(u8::gt(&a[10usize], &b[10usize]) as i8), -(u8::gt(&a[11usize], &b[11usize]) as i8), -(u8::gt(&a[12usize], &b[12usize]) as i8), -(u8::gt(&a[13usize], 
&b[13usize]) as i8), -(u8::gt(&a[14usize], &b[14usize]) as i8), -(u8::gt(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ a[0usize], b[0usize], a[1usize], b[1usize], a[2usize], b[2usize], a[3usize], b[3usize], a[4usize], b[4usize], a[5usize], b[5usize], a[6usize], b[6usize], a[7usize], b[7usize], ] .simd_into(self) } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ a[8usize], b[8usize], a[9usize], b[9usize], a[10usize], b[10usize], a[11usize], b[11usize], a[12usize], b[12usize], a[13usize], b[13usize], a[14usize], b[14usize], a[15usize], b[15usize], ] .simd_into(self) } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ a[0usize], a[2usize], a[4usize], a[6usize], a[8usize], a[10usize], a[12usize], a[14usize], b[0usize], b[2usize], b[4usize], b[6usize], b[8usize], b[10usize], b[12usize], b[14usize], ] .simd_into(self) } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ a[1usize], a[3usize], a[5usize], a[7usize], a[9usize], a[11usize], a[13usize], a[15usize], b[1usize], b[3usize], b[5usize], b[7usize], b[9usize], b[11usize], b[13usize], b[15usize], ] .simd_into(self) } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, if a[4usize] != 0 { b[4usize] } else { c[4usize] }, if a[5usize] != 0 { b[5usize] } else { c[5usize] }, if a[6usize] != 0 { b[6usize] } else { c[6usize] }, if a[7usize] != 0 { b[7usize] } else { c[7usize] }, if a[8usize] != 0 { b[8usize] } else { c[8usize] }, if a[9usize] != 0 { b[9usize] } else { c[9usize] }, if a[10usize] != 0 { b[10usize] } else { c[10usize] }, if a[11usize] != 0 { b[11usize] } else { c[11usize] }, if a[12usize] != 0 { b[12usize] } else { c[12usize] }, if a[13usize] != 0 { b[13usize] } else { c[13usize] }, if a[14usize] != 0 { b[14usize] } else { c[14usize] }, if a[15usize] != 0 { b[15usize] } else { c[15usize] }, ] .simd_into(self) } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::min(a[0usize], b[0usize]), u8::min(a[1usize], b[1usize]), u8::min(a[2usize], b[2usize]), u8::min(a[3usize], b[3usize]), u8::min(a[4usize], b[4usize]), u8::min(a[5usize], b[5usize]), u8::min(a[6usize], b[6usize]), u8::min(a[7usize], b[7usize]), u8::min(a[8usize], b[8usize]), u8::min(a[9usize], b[9usize]), u8::min(a[10usize], b[10usize]), u8::min(a[11usize], b[11usize]), u8::min(a[12usize], b[12usize]), u8::min(a[13usize], b[13usize]), u8::min(a[14usize], b[14usize]), u8::min(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::max(a[0usize], b[0usize]), u8::max(a[1usize], b[1usize]), u8::max(a[2usize], b[2usize]), u8::max(a[3usize], b[3usize]), u8::max(a[4usize], b[4usize]), u8::max(a[5usize], b[5usize]), u8::max(a[6usize], b[6usize]), u8::max(a[7usize], b[7usize]), u8::max(a[8usize], b[8usize]), u8::max(a[9usize], b[9usize]), u8::max(a[10usize], b[10usize]), u8::max(a[11usize], b[11usize]), u8::max(a[12usize], b[12usize]), u8::max(a[13usize], b[13usize]), u8::max(a[14usize], b[14usize]), u8::max(a[15usize], b[15usize]), ] .simd_into(self) } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); 
result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { [ a[0usize] as u16, a[1usize] as u16, a[2usize] as u16, a[3usize] as u16, a[4usize] as u16, a[5usize] as u16, a[6usize] as u16, a[7usize] as u16, a[8usize] as u16, a[9usize] as u16, a[10usize] as u16, a[11usize] as u16, a[12usize] as u16, a[13usize] as u16, a[14usize] as u16, a[15usize] as u16, ] .simd_into(self) } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_mask8x16(self, val: i8) -> mask8x16 { [val; 16usize].simd_into(self) } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { [ i8::not(a[0usize]), i8::not(a[1usize]), i8::not(a[2usize]), i8::not(a[3usize]), i8::not(a[4usize]), i8::not(a[5usize]), i8::not(a[6usize]), i8::not(a[7usize]), i8::not(a[8usize]), i8::not(a[9usize]), i8::not(a[10usize]), i8::not(a[11usize]), i8::not(a[12usize]), i8::not(a[13usize]), i8::not(a[14usize]), i8::not(a[15usize]), ] .simd_into(self) } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { [ i8::bitand(a[0usize], &b[0usize]), i8::bitand(a[1usize], &b[1usize]), i8::bitand(a[2usize], &b[2usize]), i8::bitand(a[3usize], &b[3usize]), i8::bitand(a[4usize], &b[4usize]), i8::bitand(a[5usize], &b[5usize]), i8::bitand(a[6usize], &b[6usize]), i8::bitand(a[7usize], &b[7usize]), i8::bitand(a[8usize], &b[8usize]), i8::bitand(a[9usize], &b[9usize]), i8::bitand(a[10usize], &b[10usize]), i8::bitand(a[11usize], &b[11usize]), i8::bitand(a[12usize], &b[12usize]), i8::bitand(a[13usize], &b[13usize]), i8::bitand(a[14usize], &b[14usize]), i8::bitand(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { [ i8::bitor(a[0usize], &b[0usize]), i8::bitor(a[1usize], &b[1usize]), i8::bitor(a[2usize], &b[2usize]), i8::bitor(a[3usize], &b[3usize]), i8::bitor(a[4usize], &b[4usize]), i8::bitor(a[5usize], &b[5usize]), i8::bitor(a[6usize], &b[6usize]), i8::bitor(a[7usize], &b[7usize]), i8::bitor(a[8usize], &b[8usize]), i8::bitor(a[9usize], &b[9usize]), i8::bitor(a[10usize], &b[10usize]), i8::bitor(a[11usize], &b[11usize]), i8::bitor(a[12usize], &b[12usize]), i8::bitor(a[13usize], &b[13usize]), i8::bitor(a[14usize], &b[14usize]), i8::bitor(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { [ i8::bitxor(a[0usize], &b[0usize]), i8::bitxor(a[1usize], &b[1usize]), i8::bitxor(a[2usize], &b[2usize]), i8::bitxor(a[3usize], &b[3usize]), i8::bitxor(a[4usize], &b[4usize]), i8::bitxor(a[5usize], &b[5usize]), i8::bitxor(a[6usize], &b[6usize]), i8::bitxor(a[7usize], &b[7usize]), i8::bitxor(a[8usize], &b[8usize]), i8::bitxor(a[9usize], &b[9usize]), i8::bitxor(a[10usize], &b[10usize]), i8::bitxor(a[11usize], &b[11usize]), i8::bitxor(a[12usize], &b[12usize]), i8::bitxor(a[13usize], &b[13usize]), i8::bitxor(a[14usize], &b[14usize]), i8::bitxor(a[15usize], &b[15usize]), ] .simd_into(self) } #[inline(always)] fn select_mask8x16( self, a: mask8x16, b: mask8x16, c: mask8x16, ) -> mask8x16 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, if a[4usize] != 0 { b[4usize] } else { c[4usize] }, if a[5usize] != 0 { b[5usize] } else { c[5usize] }, if a[6usize] != 0 { b[6usize] } else { 
c[6usize] }, if a[7usize] != 0 { b[7usize] } else { c[7usize] }, if a[8usize] != 0 { b[8usize] } else { c[8usize] }, if a[9usize] != 0 { b[9usize] } else { c[9usize] }, if a[10usize] != 0 { b[10usize] } else { c[10usize] }, if a[11usize] != 0 { b[11usize] } else { c[11usize] }, if a[12usize] != 0 { b[12usize] } else { c[12usize] }, if a[13usize] != 0 { b[13usize] } else { c[13usize] }, if a[14usize] != 0 { b[14usize] } else { c[14usize] }, if a[15usize] != 0 { b[15usize] } else { c[15usize] }, ] .simd_into(self) } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { [ -(i8::eq(&a[0usize], &b[0usize]) as i8), -(i8::eq(&a[1usize], &b[1usize]) as i8), -(i8::eq(&a[2usize], &b[2usize]) as i8), -(i8::eq(&a[3usize], &b[3usize]) as i8), -(i8::eq(&a[4usize], &b[4usize]) as i8), -(i8::eq(&a[5usize], &b[5usize]) as i8), -(i8::eq(&a[6usize], &b[6usize]) as i8), -(i8::eq(&a[7usize], &b[7usize]) as i8), -(i8::eq(&a[8usize], &b[8usize]) as i8), -(i8::eq(&a[9usize], &b[9usize]) as i8), -(i8::eq(&a[10usize], &b[10usize]) as i8), -(i8::eq(&a[11usize], &b[11usize]) as i8), -(i8::eq(&a[12usize], &b[12usize]) as i8), -(i8::eq(&a[13usize], &b[13usize]) as i8), -(i8::eq(&a[14usize], &b[14usize]) as i8), -(i8::eq(&a[15usize], &b[15usize]) as i8), ] .simd_into(self) } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { [val; 8usize].simd_into(self) } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { [ i16::not(a[0usize]), i16::not(a[1usize]), i16::not(a[2usize]), i16::not(a[3usize]), i16::not(a[4usize]), i16::not(a[5usize]), i16::not(a[6usize]), i16::not(a[7usize]), ] .simd_into(self) } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ i16::wrapping_add(a[0usize], b[0usize]), i16::wrapping_add(a[1usize], b[1usize]), i16::wrapping_add(a[2usize], b[2usize]), i16::wrapping_add(a[3usize], b[3usize]), i16::wrapping_add(a[4usize], b[4usize]), i16::wrapping_add(a[5usize], b[5usize]), i16::wrapping_add(a[6usize], b[6usize]), i16::wrapping_add(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ i16::wrapping_sub(a[0usize], b[0usize]), i16::wrapping_sub(a[1usize], b[1usize]), i16::wrapping_sub(a[2usize], b[2usize]), i16::wrapping_sub(a[3usize], b[3usize]), i16::wrapping_sub(a[4usize], b[4usize]), i16::wrapping_sub(a[5usize], b[5usize]), i16::wrapping_sub(a[6usize], b[6usize]), i16::wrapping_sub(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ i16::wrapping_mul(a[0usize], b[0usize]), i16::wrapping_mul(a[1usize], b[1usize]), i16::wrapping_mul(a[2usize], b[2usize]), i16::wrapping_mul(a[3usize], b[3usize]), i16::wrapping_mul(a[4usize], b[4usize]), i16::wrapping_mul(a[5usize], b[5usize]), i16::wrapping_mul(a[6usize], b[6usize]), i16::wrapping_mul(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ i16::bitand(a[0usize], &b[0usize]), i16::bitand(a[1usize], &b[1usize]), i16::bitand(a[2usize], &b[2usize]), i16::bitand(a[3usize], &b[3usize]), i16::bitand(a[4usize], &b[4usize]), i16::bitand(a[5usize], &b[5usize]), i16::bitand(a[6usize], &b[6usize]), i16::bitand(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn or_i16x8(self, a: i16x8, b: 
i16x8) -> i16x8 { [ i16::bitor(a[0usize], &b[0usize]), i16::bitor(a[1usize], &b[1usize]), i16::bitor(a[2usize], &b[2usize]), i16::bitor(a[3usize], &b[3usize]), i16::bitor(a[4usize], &b[4usize]), i16::bitor(a[5usize], &b[5usize]), i16::bitor(a[6usize], &b[6usize]), i16::bitor(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ i16::bitxor(a[0usize], &b[0usize]), i16::bitxor(a[1usize], &b[1usize]), i16::bitxor(a[2usize], &b[2usize]), i16::bitxor(a[3usize], &b[3usize]), i16::bitxor(a[4usize], &b[4usize]), i16::bitxor(a[5usize], &b[5usize]), i16::bitxor(a[6usize], &b[6usize]), i16::bitxor(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { [ i16::shr(a[0usize], shift as i16), i16::shr(a[1usize], shift as i16), i16::shr(a[2usize], shift as i16), i16::shr(a[3usize], shift as i16), i16::shr(a[4usize], shift as i16), i16::shr(a[5usize], shift as i16), i16::shr(a[6usize], shift as i16), i16::shr(a[7usize], shift as i16), ] .simd_into(self) } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ i16::shr(a[0usize], &b[0usize]), i16::shr(a[1usize], &b[1usize]), i16::shr(a[2usize], &b[2usize]), i16::shr(a[3usize], &b[3usize]), i16::shr(a[4usize], &b[4usize]), i16::shr(a[5usize], &b[5usize]), i16::shr(a[6usize], &b[6usize]), i16::shr(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { [ i16::shl(a[0usize], shift as i16), i16::shl(a[1usize], shift as i16), i16::shl(a[2usize], shift as i16), i16::shl(a[3usize], shift as i16), i16::shl(a[4usize], shift as i16), i16::shl(a[5usize], shift as i16), i16::shl(a[6usize], shift as i16), i16::shl(a[7usize], shift as i16), ] .simd_into(self) } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { [ -(i16::eq(&a[0usize], &b[0usize]) as i16), -(i16::eq(&a[1usize], &b[1usize]) as i16), -(i16::eq(&a[2usize], &b[2usize]) as i16), -(i16::eq(&a[3usize], &b[3usize]) as i16), -(i16::eq(&a[4usize], &b[4usize]) as i16), -(i16::eq(&a[5usize], &b[5usize]) as i16), -(i16::eq(&a[6usize], &b[6usize]) as i16), -(i16::eq(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { [ -(i16::lt(&a[0usize], &b[0usize]) as i16), -(i16::lt(&a[1usize], &b[1usize]) as i16), -(i16::lt(&a[2usize], &b[2usize]) as i16), -(i16::lt(&a[3usize], &b[3usize]) as i16), -(i16::lt(&a[4usize], &b[4usize]) as i16), -(i16::lt(&a[5usize], &b[5usize]) as i16), -(i16::lt(&a[6usize], &b[6usize]) as i16), -(i16::lt(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { [ -(i16::le(&a[0usize], &b[0usize]) as i16), -(i16::le(&a[1usize], &b[1usize]) as i16), -(i16::le(&a[2usize], &b[2usize]) as i16), -(i16::le(&a[3usize], &b[3usize]) as i16), -(i16::le(&a[4usize], &b[4usize]) as i16), -(i16::le(&a[5usize], &b[5usize]) as i16), -(i16::le(&a[6usize], &b[6usize]) as i16), -(i16::le(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { [ -(i16::ge(&a[0usize], &b[0usize]) as i16), -(i16::ge(&a[1usize], &b[1usize]) as i16), -(i16::ge(&a[2usize], &b[2usize]) as i16), -(i16::ge(&a[3usize], &b[3usize]) as i16), -(i16::ge(&a[4usize], &b[4usize]) as i16), -(i16::ge(&a[5usize], &b[5usize]) as i16), -(i16::ge(&a[6usize], &b[6usize]) as i16), -(i16::ge(&a[7usize], &b[7usize]) as i16), ] 
.simd_into(self) } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { [ -(i16::gt(&a[0usize], &b[0usize]) as i16), -(i16::gt(&a[1usize], &b[1usize]) as i16), -(i16::gt(&a[2usize], &b[2usize]) as i16), -(i16::gt(&a[3usize], &b[3usize]) as i16), -(i16::gt(&a[4usize], &b[4usize]) as i16), -(i16::gt(&a[5usize], &b[5usize]) as i16), -(i16::gt(&a[6usize], &b[6usize]) as i16), -(i16::gt(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ a[0usize], b[0usize], a[1usize], b[1usize], a[2usize], b[2usize], a[3usize], b[3usize], ] .simd_into(self) } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ a[4usize], b[4usize], a[5usize], b[5usize], a[6usize], b[6usize], a[7usize], b[7usize], ] .simd_into(self) } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ a[0usize], a[2usize], a[4usize], a[6usize], b[0usize], b[2usize], b[4usize], b[6usize], ] .simd_into(self) } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ a[1usize], a[3usize], a[5usize], a[7usize], b[1usize], b[3usize], b[5usize], b[7usize], ] .simd_into(self) } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, if a[4usize] != 0 { b[4usize] } else { c[4usize] }, if a[5usize] != 0 { b[5usize] } else { c[5usize] }, if a[6usize] != 0 { b[6usize] } else { c[6usize] }, if a[7usize] != 0 { b[7usize] } else { c[7usize] }, ] .simd_into(self) } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ i16::min(a[0usize], b[0usize]), i16::min(a[1usize], b[1usize]), i16::min(a[2usize], b[2usize]), i16::min(a[3usize], b[3usize]), i16::min(a[4usize], b[4usize]), i16::min(a[5usize], b[5usize]), i16::min(a[6usize], b[6usize]), i16::min(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ i16::max(a[0usize], b[0usize]), i16::max(a[1usize], b[1usize]), i16::max(a[2usize], b[2usize]), i16::max(a[3usize], b[3usize]), i16::max(a[4usize], b[4usize]), i16::max(a[5usize], b[5usize]), i16::max(a[6usize], b[6usize]), i16::max(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { [ i16::neg(a[0usize]), i16::neg(a[1usize]), i16::neg(a[2usize]), i16::neg(a[3usize]), i16::neg(a[4usize]), i16::neg(a[5usize]), i16::neg(a[6usize]), i16::neg(a[7usize]), ] .simd_into(self) } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { [val; 8usize].simd_into(self) } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { [ u16::not(a[0usize]), u16::not(a[1usize]), u16::not(a[2usize]), u16::not(a[3usize]), u16::not(a[4usize]), u16::not(a[5usize]), u16::not(a[6usize]), u16::not(a[7usize]), ] .simd_into(self) } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ 
u16::wrapping_add(a[0usize], b[0usize]), u16::wrapping_add(a[1usize], b[1usize]), u16::wrapping_add(a[2usize], b[2usize]), u16::wrapping_add(a[3usize], b[3usize]), u16::wrapping_add(a[4usize], b[4usize]), u16::wrapping_add(a[5usize], b[5usize]), u16::wrapping_add(a[6usize], b[6usize]), u16::wrapping_add(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ u16::wrapping_sub(a[0usize], b[0usize]), u16::wrapping_sub(a[1usize], b[1usize]), u16::wrapping_sub(a[2usize], b[2usize]), u16::wrapping_sub(a[3usize], b[3usize]), u16::wrapping_sub(a[4usize], b[4usize]), u16::wrapping_sub(a[5usize], b[5usize]), u16::wrapping_sub(a[6usize], b[6usize]), u16::wrapping_sub(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ u16::wrapping_mul(a[0usize], b[0usize]), u16::wrapping_mul(a[1usize], b[1usize]), u16::wrapping_mul(a[2usize], b[2usize]), u16::wrapping_mul(a[3usize], b[3usize]), u16::wrapping_mul(a[4usize], b[4usize]), u16::wrapping_mul(a[5usize], b[5usize]), u16::wrapping_mul(a[6usize], b[6usize]), u16::wrapping_mul(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ u16::bitand(a[0usize], &b[0usize]), u16::bitand(a[1usize], &b[1usize]), u16::bitand(a[2usize], &b[2usize]), u16::bitand(a[3usize], &b[3usize]), u16::bitand(a[4usize], &b[4usize]), u16::bitand(a[5usize], &b[5usize]), u16::bitand(a[6usize], &b[6usize]), u16::bitand(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ u16::bitor(a[0usize], &b[0usize]), u16::bitor(a[1usize], &b[1usize]), u16::bitor(a[2usize], &b[2usize]), u16::bitor(a[3usize], &b[3usize]), u16::bitor(a[4usize], &b[4usize]), u16::bitor(a[5usize], &b[5usize]), u16::bitor(a[6usize], &b[6usize]), u16::bitor(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ u16::bitxor(a[0usize], &b[0usize]), u16::bitxor(a[1usize], &b[1usize]), u16::bitxor(a[2usize], &b[2usize]), u16::bitxor(a[3usize], &b[3usize]), u16::bitxor(a[4usize], &b[4usize]), u16::bitxor(a[5usize], &b[5usize]), u16::bitxor(a[6usize], &b[6usize]), u16::bitxor(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { [ u16::shr(a[0usize], shift as u16), u16::shr(a[1usize], shift as u16), u16::shr(a[2usize], shift as u16), u16::shr(a[3usize], shift as u16), u16::shr(a[4usize], shift as u16), u16::shr(a[5usize], shift as u16), u16::shr(a[6usize], shift as u16), u16::shr(a[7usize], shift as u16), ] .simd_into(self) } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ u16::shr(a[0usize], &b[0usize]), u16::shr(a[1usize], &b[1usize]), u16::shr(a[2usize], &b[2usize]), u16::shr(a[3usize], &b[3usize]), u16::shr(a[4usize], &b[4usize]), u16::shr(a[5usize], &b[5usize]), u16::shr(a[6usize], &b[6usize]), u16::shr(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { [ u16::shl(a[0usize], shift as u16), u16::shl(a[1usize], shift as u16), u16::shl(a[2usize], shift as u16), u16::shl(a[3usize], shift as u16), u16::shl(a[4usize], shift as u16), u16::shl(a[5usize], shift as u16), u16::shl(a[6usize], shift as u16), u16::shl(a[7usize], shift as u16), ] .simd_into(self) } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { [ -(u16::eq(&a[0usize], &b[0usize]) as i16), 
-(u16::eq(&a[1usize], &b[1usize]) as i16), -(u16::eq(&a[2usize], &b[2usize]) as i16), -(u16::eq(&a[3usize], &b[3usize]) as i16), -(u16::eq(&a[4usize], &b[4usize]) as i16), -(u16::eq(&a[5usize], &b[5usize]) as i16), -(u16::eq(&a[6usize], &b[6usize]) as i16), -(u16::eq(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { [ -(u16::lt(&a[0usize], &b[0usize]) as i16), -(u16::lt(&a[1usize], &b[1usize]) as i16), -(u16::lt(&a[2usize], &b[2usize]) as i16), -(u16::lt(&a[3usize], &b[3usize]) as i16), -(u16::lt(&a[4usize], &b[4usize]) as i16), -(u16::lt(&a[5usize], &b[5usize]) as i16), -(u16::lt(&a[6usize], &b[6usize]) as i16), -(u16::lt(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { [ -(u16::le(&a[0usize], &b[0usize]) as i16), -(u16::le(&a[1usize], &b[1usize]) as i16), -(u16::le(&a[2usize], &b[2usize]) as i16), -(u16::le(&a[3usize], &b[3usize]) as i16), -(u16::le(&a[4usize], &b[4usize]) as i16), -(u16::le(&a[5usize], &b[5usize]) as i16), -(u16::le(&a[6usize], &b[6usize]) as i16), -(u16::le(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { [ -(u16::ge(&a[0usize], &b[0usize]) as i16), -(u16::ge(&a[1usize], &b[1usize]) as i16), -(u16::ge(&a[2usize], &b[2usize]) as i16), -(u16::ge(&a[3usize], &b[3usize]) as i16), -(u16::ge(&a[4usize], &b[4usize]) as i16), -(u16::ge(&a[5usize], &b[5usize]) as i16), -(u16::ge(&a[6usize], &b[6usize]) as i16), -(u16::ge(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { [ -(u16::gt(&a[0usize], &b[0usize]) as i16), -(u16::gt(&a[1usize], &b[1usize]) as i16), -(u16::gt(&a[2usize], &b[2usize]) as i16), -(u16::gt(&a[3usize], &b[3usize]) as i16), -(u16::gt(&a[4usize], &b[4usize]) as i16), -(u16::gt(&a[5usize], &b[5usize]) as i16), -(u16::gt(&a[6usize], &b[6usize]) as i16), -(u16::gt(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ a[0usize], b[0usize], a[1usize], b[1usize], a[2usize], b[2usize], a[3usize], b[3usize], ] .simd_into(self) } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ a[4usize], b[4usize], a[5usize], b[5usize], a[6usize], b[6usize], a[7usize], b[7usize], ] .simd_into(self) } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ a[0usize], a[2usize], a[4usize], a[6usize], b[0usize], b[2usize], b[4usize], b[6usize], ] .simd_into(self) } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ a[1usize], a[3usize], a[5usize], a[7usize], b[1usize], b[3usize], b[5usize], b[7usize], ] .simd_into(self) } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, if a[4usize] != 0 { b[4usize] } else { c[4usize] }, if a[5usize] != 0 { b[5usize] } else { c[5usize] }, if a[6usize] != 0 { b[6usize] } else { c[6usize] }, if a[7usize] != 0 { b[7usize] } else { c[7usize] }, ] .simd_into(self) } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ u16::min(a[0usize], b[0usize]), u16::min(a[1usize], b[1usize]), u16::min(a[2usize], b[2usize]), u16::min(a[3usize], b[3usize]), 
u16::min(a[4usize], b[4usize]), u16::min(a[5usize], b[5usize]), u16::min(a[6usize], b[6usize]), u16::min(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ u16::max(a[0usize], b[0usize]), u16::max(a[1usize], b[1usize]), u16::max(a[2usize], b[2usize]), u16::max(a[3usize], b[3usize]), u16::max(a[4usize], b[4usize]), u16::max(a[5usize], b[5usize]), u16::max(a[6usize], b[6usize]), u16::max(a[7usize], b[7usize]), ] .simd_into(self) } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_mask16x8(self, val: i16) -> mask16x8 { [val; 8usize].simd_into(self) } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { [ i16::not(a[0usize]), i16::not(a[1usize]), i16::not(a[2usize]), i16::not(a[3usize]), i16::not(a[4usize]), i16::not(a[5usize]), i16::not(a[6usize]), i16::not(a[7usize]), ] .simd_into(self) } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { [ i16::bitand(a[0usize], &b[0usize]), i16::bitand(a[1usize], &b[1usize]), i16::bitand(a[2usize], &b[2usize]), i16::bitand(a[3usize], &b[3usize]), i16::bitand(a[4usize], &b[4usize]), i16::bitand(a[5usize], &b[5usize]), i16::bitand(a[6usize], &b[6usize]), i16::bitand(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { [ i16::bitor(a[0usize], &b[0usize]), i16::bitor(a[1usize], &b[1usize]), i16::bitor(a[2usize], &b[2usize]), i16::bitor(a[3usize], &b[3usize]), i16::bitor(a[4usize], &b[4usize]), i16::bitor(a[5usize], &b[5usize]), i16::bitor(a[6usize], &b[6usize]), i16::bitor(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { [ i16::bitxor(a[0usize], &b[0usize]), i16::bitxor(a[1usize], &b[1usize]), i16::bitxor(a[2usize], &b[2usize]), i16::bitxor(a[3usize], &b[3usize]), i16::bitxor(a[4usize], &b[4usize]), i16::bitxor(a[5usize], &b[5usize]), i16::bitxor(a[6usize], &b[6usize]), i16::bitxor(a[7usize], &b[7usize]), ] .simd_into(self) } #[inline(always)] fn select_mask16x8( self, a: mask16x8, b: mask16x8, c: mask16x8, ) -> mask16x8 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, if a[4usize] != 0 { b[4usize] } else { c[4usize] }, if a[5usize] != 0 { b[5usize] } else { c[5usize] }, if a[6usize] != 0 { b[6usize] } else { c[6usize] }, if a[7usize] != 0 { b[7usize] } else { c[7usize] }, ] .simd_into(self) } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { [ -(i16::eq(&a[0usize], &b[0usize]) as i16), -(i16::eq(&a[1usize], &b[1usize]) as i16), -(i16::eq(&a[2usize], &b[2usize]) as i16), -(i16::eq(&a[3usize], &b[3usize]) as i16), -(i16::eq(&a[4usize], &b[4usize]) as i16), -(i16::eq(&a[5usize], &b[5usize]) as i16), -(i16::eq(&a[6usize], &b[6usize]) as i16), -(i16::eq(&a[7usize], &b[7usize]) as i16), ] .simd_into(self) } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> 
mask16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { [val; 4usize].simd_into(self) } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { [ i32::not(a[0usize]), i32::not(a[1usize]), i32::not(a[2usize]), i32::not(a[3usize]), ] .simd_into(self) } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::wrapping_add(a[0usize], b[0usize]), i32::wrapping_add(a[1usize], b[1usize]), i32::wrapping_add(a[2usize], b[2usize]), i32::wrapping_add(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::wrapping_sub(a[0usize], b[0usize]), i32::wrapping_sub(a[1usize], b[1usize]), i32::wrapping_sub(a[2usize], b[2usize]), i32::wrapping_sub(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::wrapping_mul(a[0usize], b[0usize]), i32::wrapping_mul(a[1usize], b[1usize]), i32::wrapping_mul(a[2usize], b[2usize]), i32::wrapping_mul(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::bitand(a[0usize], &b[0usize]), i32::bitand(a[1usize], &b[1usize]), i32::bitand(a[2usize], &b[2usize]), i32::bitand(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::bitor(a[0usize], &b[0usize]), i32::bitor(a[1usize], &b[1usize]), i32::bitor(a[2usize], &b[2usize]), i32::bitor(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::bitxor(a[0usize], &b[0usize]), i32::bitxor(a[1usize], &b[1usize]), i32::bitxor(a[2usize], &b[2usize]), i32::bitxor(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { [ i32::shr(a[0usize], shift as i32), i32::shr(a[1usize], shift as i32), i32::shr(a[2usize], shift as i32), i32::shr(a[3usize], shift as i32), ] .simd_into(self) } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::shr(a[0usize], &b[0usize]), i32::shr(a[1usize], &b[1usize]), i32::shr(a[2usize], &b[2usize]), i32::shr(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { [ i32::shl(a[0usize], shift as i32), i32::shl(a[1usize], shift as i32), i32::shl(a[2usize], shift as i32), i32::shl(a[3usize], shift as i32), ] .simd_into(self) } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { [ -(i32::eq(&a[0usize], &b[0usize]) as i32), -(i32::eq(&a[1usize], &b[1usize]) as i32), -(i32::eq(&a[2usize], &b[2usize]) as i32), -(i32::eq(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { [ -(i32::lt(&a[0usize], &b[0usize]) as i32), -(i32::lt(&a[1usize], &b[1usize]) as i32), -(i32::lt(&a[2usize], &b[2usize]) as i32), -(i32::lt(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { [ -(i32::le(&a[0usize], &b[0usize]) as i32), -(i32::le(&a[1usize], &b[1usize]) as i32), -(i32::le(&a[2usize], &b[2usize]) as i32), -(i32::le(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { [ -(i32::ge(&a[0usize], &b[0usize]) as i32), -(i32::ge(&a[1usize], &b[1usize]) as i32), 
-(i32::ge(&a[2usize], &b[2usize]) as i32), -(i32::ge(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { [ -(i32::gt(&a[0usize], &b[0usize]) as i32), -(i32::gt(&a[1usize], &b[1usize]) as i32), -(i32::gt(&a[2usize], &b[2usize]) as i32), -(i32::gt(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [a[0usize], b[0usize], a[1usize], b[1usize]].simd_into(self) } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [a[2usize], b[2usize], a[3usize], b[3usize]].simd_into(self) } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [a[0usize], a[2usize], b[0usize], b[2usize]].simd_into(self) } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [a[1usize], a[3usize], b[1usize], b[3usize]].simd_into(self) } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, ] .simd_into(self) } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::min(a[0usize], b[0usize]), i32::min(a[1usize], b[1usize]), i32::min(a[2usize], b[2usize]), i32::min(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::max(a[0usize], b[0usize]), i32::max(a[1usize], b[1usize]), i32::max(a[2usize], b[2usize]), i32::max(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { [ i32::neg(a[0usize]), i32::neg(a[1usize]), i32::neg(a[2usize]), i32::neg(a[3usize]), ] .simd_into(self) } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { [ a[0usize] as f32, a[1usize] as f32, a[2usize] as f32, a[3usize] as f32, ] .simd_into(self) } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { [val; 4usize].simd_into(self) } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { [ u32::not(a[0usize]), u32::not(a[1usize]), u32::not(a[2usize]), u32::not(a[3usize]), ] .simd_into(self) } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ u32::wrapping_add(a[0usize], b[0usize]), u32::wrapping_add(a[1usize], b[1usize]), u32::wrapping_add(a[2usize], b[2usize]), u32::wrapping_add(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ u32::wrapping_sub(a[0usize], b[0usize]), u32::wrapping_sub(a[1usize], b[1usize]), u32::wrapping_sub(a[2usize], b[2usize]), u32::wrapping_sub(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ u32::wrapping_mul(a[0usize], b[0usize]), u32::wrapping_mul(a[1usize], b[1usize]), u32::wrapping_mul(a[2usize], b[2usize]), u32::wrapping_mul(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ 
u32::bitand(a[0usize], &b[0usize]), u32::bitand(a[1usize], &b[1usize]), u32::bitand(a[2usize], &b[2usize]), u32::bitand(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ u32::bitor(a[0usize], &b[0usize]), u32::bitor(a[1usize], &b[1usize]), u32::bitor(a[2usize], &b[2usize]), u32::bitor(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ u32::bitxor(a[0usize], &b[0usize]), u32::bitxor(a[1usize], &b[1usize]), u32::bitxor(a[2usize], &b[2usize]), u32::bitxor(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { [ u32::shr(a[0usize], shift as u32), u32::shr(a[1usize], shift as u32), u32::shr(a[2usize], shift as u32), u32::shr(a[3usize], shift as u32), ] .simd_into(self) } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ u32::shr(a[0usize], &b[0usize]), u32::shr(a[1usize], &b[1usize]), u32::shr(a[2usize], &b[2usize]), u32::shr(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { [ u32::shl(a[0usize], shift as u32), u32::shl(a[1usize], shift as u32), u32::shl(a[2usize], shift as u32), u32::shl(a[3usize], shift as u32), ] .simd_into(self) } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { [ -(u32::eq(&a[0usize], &b[0usize]) as i32), -(u32::eq(&a[1usize], &b[1usize]) as i32), -(u32::eq(&a[2usize], &b[2usize]) as i32), -(u32::eq(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { [ -(u32::lt(&a[0usize], &b[0usize]) as i32), -(u32::lt(&a[1usize], &b[1usize]) as i32), -(u32::lt(&a[2usize], &b[2usize]) as i32), -(u32::lt(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { [ -(u32::le(&a[0usize], &b[0usize]) as i32), -(u32::le(&a[1usize], &b[1usize]) as i32), -(u32::le(&a[2usize], &b[2usize]) as i32), -(u32::le(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { [ -(u32::ge(&a[0usize], &b[0usize]) as i32), -(u32::ge(&a[1usize], &b[1usize]) as i32), -(u32::ge(&a[2usize], &b[2usize]) as i32), -(u32::ge(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { [ -(u32::gt(&a[0usize], &b[0usize]) as i32), -(u32::gt(&a[1usize], &b[1usize]) as i32), -(u32::gt(&a[2usize], &b[2usize]) as i32), -(u32::gt(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [a[0usize], b[0usize], a[1usize], b[1usize]].simd_into(self) } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [a[2usize], b[2usize], a[3usize], b[3usize]].simd_into(self) } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [a[0usize], a[2usize], b[0usize], b[2usize]].simd_into(self) } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [a[1usize], a[3usize], b[1usize], b[3usize]].simd_into(self) } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, ] .simd_into(self) } #[inline(always)] fn 
min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ u32::min(a[0usize], b[0usize]), u32::min(a[1usize], b[1usize]), u32::min(a[2usize], b[2usize]), u32::min(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ u32::max(a[0usize], b[0usize]), u32::max(a[1usize], b[1usize]), u32::max(a[2usize], b[2usize]), u32::max(a[3usize], b[3usize]), ] .simd_into(self) } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { [ a[0usize] as f32, a[1usize] as f32, a[2usize] as f32, a[3usize] as f32, ] .simd_into(self) } #[inline(always)] fn splat_mask32x4(self, val: i32) -> mask32x4 { [val; 4usize].simd_into(self) } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { [ i32::not(a[0usize]), i32::not(a[1usize]), i32::not(a[2usize]), i32::not(a[3usize]), ] .simd_into(self) } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { [ i32::bitand(a[0usize], &b[0usize]), i32::bitand(a[1usize], &b[1usize]), i32::bitand(a[2usize], &b[2usize]), i32::bitand(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { [ i32::bitor(a[0usize], &b[0usize]), i32::bitor(a[1usize], &b[1usize]), i32::bitor(a[2usize], &b[2usize]), i32::bitor(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { [ i32::bitxor(a[0usize], &b[0usize]), i32::bitxor(a[1usize], &b[1usize]), i32::bitxor(a[2usize], &b[2usize]), i32::bitxor(a[3usize], &b[3usize]), ] .simd_into(self) } #[inline(always)] fn select_mask32x4( self, a: mask32x4, b: mask32x4, c: mask32x4, ) -> mask32x4 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, if a[2usize] != 0 { b[2usize] } else { c[2usize] }, if a[3usize] != 0 { b[3usize] } else { c[3usize] }, ] .simd_into(self) } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { [ -(i32::eq(&a[0usize], &b[0usize]) as i32), -(i32::eq(&a[1usize], &b[1usize]) as i32), -(i32::eq(&a[2usize], &b[2usize]) as i32), -(i32::eq(&a[3usize], &b[3usize]) as i32), ] .simd_into(self) } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { [val; 2usize].simd_into(self) } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { [f64::abs(a[0usize]), f64::abs(a[1usize])].simd_into(self) } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { [f64::neg(a[0usize]), f64::neg(a[1usize])].simd_into(self) } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { [f64::sqrt(a[0usize]), f64::sqrt(a[1usize])].simd_into(self) } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::add(a[0usize], &b[0usize]), f64::add(a[1usize], &b[1usize]), ] .simd_into(self) } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::sub(a[0usize], &b[0usize]), f64::sub(a[1usize], &b[1usize]), ] .simd_into(self) } #[inline(always)] fn 
mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::mul(a[0usize], &b[0usize]), f64::mul(a[1usize], &b[1usize]), ] .simd_into(self) } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::div(a[0usize], &b[0usize]), f64::div(a[1usize], &b[1usize]), ] .simd_into(self) } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::copysign(a[0usize], b[0usize]), f64::copysign(a[1usize], b[1usize]), ] .simd_into(self) } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { [ -(f64::eq(&a[0usize], &b[0usize]) as i64), -(f64::eq(&a[1usize], &b[1usize]) as i64), ] .simd_into(self) } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { [ -(f64::lt(&a[0usize], &b[0usize]) as i64), -(f64::lt(&a[1usize], &b[1usize]) as i64), ] .simd_into(self) } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { [ -(f64::le(&a[0usize], &b[0usize]) as i64), -(f64::le(&a[1usize], &b[1usize]) as i64), ] .simd_into(self) } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { [ -(f64::ge(&a[0usize], &b[0usize]) as i64), -(f64::ge(&a[1usize], &b[1usize]) as i64), ] .simd_into(self) } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { [ -(f64::gt(&a[0usize], &b[0usize]) as i64), -(f64::gt(&a[1usize], &b[1usize]) as i64), ] .simd_into(self) } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [a[0usize], b[0usize]].simd_into(self) } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [a[1usize], b[1usize]].simd_into(self) } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [a[0usize], b[0usize]].simd_into(self) } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [a[1usize], b[1usize]].simd_into(self) } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::max(a[0usize], b[0usize]), f64::max(a[1usize], b[1usize]), ] .simd_into(self) } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::max(a[0usize], b[0usize]), f64::max(a[1usize], b[1usize]), ] .simd_into(self) } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::min(a[0usize], b[0usize]), f64::min(a[1usize], b[1usize]), ] .simd_into(self) } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { [ f64::min(a[0usize], b[0usize]), f64::min(a[1usize], b[1usize]), ] .simd_into(self) } #[inline(always)] fn madd_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { a.mul(b).add(c) } #[inline(always)] fn msub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { a.mul(b).sub(c) } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { [f64::floor(a[0usize]), f64::floor(a[1usize])].simd_into(self) } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { [f64::fract(a[0usize]), f64::fract(a[1usize])].simd_into(self) } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { [f64::trunc(a[0usize]), f64::trunc(a[1usize])].simd_into(self) } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, ] .simd_into(self) } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { let mut result = [0.0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { f32x4 { 
val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_mask64x2(self, val: i64) -> mask64x2 { [val; 2usize].simd_into(self) } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { [i64::not(a[0usize]), i64::not(a[1usize])].simd_into(self) } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { [ i64::bitand(a[0usize], &b[0usize]), i64::bitand(a[1usize], &b[1usize]), ] .simd_into(self) } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { [ i64::bitor(a[0usize], &b[0usize]), i64::bitor(a[1usize], &b[1usize]), ] .simd_into(self) } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { [ i64::bitxor(a[0usize], &b[0usize]), i64::bitxor(a[1usize], &b[1usize]), ] .simd_into(self) } #[inline(always)] fn select_mask64x2( self, a: mask64x2, b: mask64x2, c: mask64x2, ) -> mask64x2 { [ if a[0usize] != 0 { b[0usize] } else { c[0usize] }, if a[1usize] != 0 { b[1usize] } else { c[1usize] }, ] .simd_into(self) } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { [ -(i64::eq(&a[0usize], &b[0usize]) as i64), -(i64::eq(&a[1usize], &b[1usize]) as i64), ] .simd_into(self) } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { let mut result = [0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f32x8(self, a: f32) -> f32x8 { let half = self.splat_f32x4(a); self.combine_f32x4(half, half) } #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) } #[inline(always)] fn neg_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) } #[inline(always)] fn sqrt_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) } #[inline(always)] fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) } #[inline(always)] fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) } #[inline(always)] fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) } #[inline(always)] fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) } #[inline(always)] fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) } #[inline(always)] fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) } 
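    // The remaining f32x8 comparisons below follow the same pattern as
    // `simd_eq_f32x8` and `simd_lt_f32x8` above: split each 8-lane vector
    // into two f32x4 halves, apply the 4-lane comparison primitive to each
    // half, and recombine the two mask32x4 results into a mask32x8.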
#[inline(always)] fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) } #[inline(always)] fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) } #[inline(always)] fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) } #[inline(always)] fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, _) = self.split_f32x8(a); let (b0, _) = self.split_f32x8(b); self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) } #[inline(always)] fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (_, a1) = self.split_f32x8(a); let (_, b1) = self.split_f32x8(b); self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) } #[inline(always)] fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) } #[inline(always)] fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) } #[inline(always)] fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) } #[inline(always)] fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.max_precise_f32x4(a0, b0), self.max_precise_f32x4(a1, b1), ) } #[inline(always)] fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.min_precise_f32x4(a0, b0), self.min_precise_f32x4(a1, b1), ) } #[inline(always)] fn madd_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1)) } #[inline(always)] fn msub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1)) } #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) } #[inline(always)] fn fract_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) } #[inline(always)] fn trunc_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) } #[inline(always)] fn select_f32x8(self, a: mask32x8, b: f32x8, c: 
f32x8) -> f32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) } #[inline(always)] fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { let mut result = [0.0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( self.reinterpret_f64_f32x4(a0), self.reinterpret_f64_f32x4(a1), ) } #[inline(always)] fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4( self.reinterpret_i32_f32x4(a0), self.reinterpret_i32_f32x4(a1), ) } #[inline(always)] fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { let (a0, a1) = self.split_f32x8(a); self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) } #[inline(always)] fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4( self.reinterpret_u32_f32x4(a0), self.reinterpret_u32_f32x4(a1), ) } #[inline(always)] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) } #[inline(always)] fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) } #[inline(always)] fn splat_i8x32(self, a: i8) -> i8x32 { let half = self.splat_i8x16(a); self.combine_i8x16(half, half) } #[inline(always)] fn not_i8x32(self, a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) } #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) } #[inline(always)] fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) } #[inline(always)] fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) } #[inline(always)] fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) } #[inline(always)] fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) } #[inline(always)] fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) } #[inline(always)] fn shr_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b)) } #[inline(always)] fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 
{ let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) } #[inline(always)] fn shl_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shl_i8x16(a0, b), self.shl_i8x16(a1, b)) } #[inline(always)] fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) } #[inline(always)] fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) } #[inline(always)] fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) } #[inline(always)] fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) } #[inline(always)] fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) } #[inline(always)] fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, _) = self.split_i8x32(a); let (b0, _) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) } #[inline(always)] fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (_, a1) = self.split_i8x32(a); let (_, b1) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) } #[inline(always)] fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) } #[inline(always)] fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) } #[inline(always)] fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_i8x32(b); let (c0, c1) = self.split_i8x32(c); self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) } #[inline(always)] fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) } #[inline(always)] fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) } #[inline(always)] fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x32(self, 
a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) } #[inline(always)] fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) } #[inline(always)] fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { let (a0, a1) = self.split_i8x32(a); self.combine_u32x4( self.reinterpret_u32_i8x16(a0), self.reinterpret_u32_i8x16(a1), ) } #[inline(always)] fn splat_u8x32(self, a: u8) -> u8x32 { let half = self.splat_u8x16(a); self.combine_u8x16(half, half) } #[inline(always)] fn not_u8x32(self, a: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) } #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) } #[inline(always)] fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) } #[inline(always)] fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) } #[inline(always)] fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) } #[inline(always)] fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) } #[inline(always)] fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) } #[inline(always)] fn shr_u8x32(self, a: u8x32, b: u32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b)) } #[inline(always)] fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) } #[inline(always)] fn shl_u8x32(self, a: u8x32, b: u32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.shl_u8x16(a0, b), self.shl_u8x16(a1, b)) } #[inline(always)] fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) } #[inline(always)] fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) } #[inline(always)] fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) } #[inline(always)] fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) } #[inline(always)] fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> 
mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) } #[inline(always)] fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, _) = self.split_u8x32(a); let (b0, _) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) } #[inline(always)] fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (_, a1) = self.split_u8x32(a); let (_, b1) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) } #[inline(always)] fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) } #[inline(always)] fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) } #[inline(always)] fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_u8x32(b); let (c0, c1) = self.split_u8x32(c); self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) } #[inline(always)] fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) } #[inline(always)] fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) } #[inline(always)] fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn widen_u8x32(self, a: u8x32) -> u16x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) } #[inline(always)] fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { let (a0, a1) = self.split_u8x32(a); self.combine_u32x4( self.reinterpret_u32_u8x16(a0), self.reinterpret_u32_u8x16(a1), ) } #[inline(always)] fn splat_mask8x32(self, a: i8) -> mask8x32 { let half = self.splat_mask8x16(a); self.combine_mask8x16(half, half) } #[inline(always)] fn not_mask8x32(self, a: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) } #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) } #[inline(always)] fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) } #[inline(always)] fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); 
self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) } #[inline(always)] fn select_mask8x32( self, a: mask8x32, b: mask8x32, c: mask8x32, ) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); let (c0, c1) = self.split_mask8x32(c); self.combine_mask8x16( self.select_mask8x16(a0, b0, c0), self.select_mask8x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) } #[inline(always)] fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x16(self, a: i16) -> i16x16 { let half = self.splat_i16x8(a); self.combine_i16x8(half, half) } #[inline(always)] fn not_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) } #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) } #[inline(always)] fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) } #[inline(always)] fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) } #[inline(always)] fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) } #[inline(always)] fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) } #[inline(always)] fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) } #[inline(always)] fn shr_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b)) } #[inline(always)] fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) } #[inline(always)] fn shl_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shl_i16x8(a0, b), self.shl_i16x8(a1, b)) } #[inline(always)] fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) } #[inline(always)] fn simd_lt_i16x16(self, a: 
i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) } #[inline(always)] fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) } #[inline(always)] fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) } #[inline(always)] fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) } #[inline(always)] fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, _) = self.split_i16x16(a); let (b0, _) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) } #[inline(always)] fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (_, a1) = self.split_i16x16(a); let (_, b1) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) } #[inline(always)] fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) } #[inline(always)] fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) } #[inline(always)] fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_i16x16(b); let (c0, c1) = self.split_i16x16(c); self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) } #[inline(always)] fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) } #[inline(always)] fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) } #[inline(always)] fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) } #[inline(always)] fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { let (a0, a1) = self.split_i16x16(a); self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) } #[inline(always)] fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { let (a0, a1) = self.split_i16x16(a); self.combine_u32x4( self.reinterpret_u32_i16x8(a0), self.reinterpret_u32_i16x8(a1), ) } 
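// The u16x16 operations below follow the same doubling pattern used throughout this
// file: each 16-lane vector is modeled as a pair of native u16x8 halves, the 8-lane
// operation is applied to each half, and `combine_u16x8` concatenates the results.
// Only lane-crossing operations (zip/unzip) need to route lanes between the halves.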
#[inline(always)] fn splat_u16x16(self, a: u16) -> u16x16 { let half = self.splat_u16x8(a); self.combine_u16x8(half, half) } #[inline(always)] fn not_u16x16(self, a: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) } #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) } #[inline(always)] fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) } #[inline(always)] fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) } #[inline(always)] fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) } #[inline(always)] fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) } #[inline(always)] fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) } #[inline(always)] fn shr_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b)) } #[inline(always)] fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) } #[inline(always)] fn shl_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shl_u16x8(a0, b), self.shl_u16x8(a1, b)) } #[inline(always)] fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) } #[inline(always)] fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) } #[inline(always)] fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) } #[inline(always)] fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) } #[inline(always)] fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) } #[inline(always)] fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, _) = self.split_u16x16(a); let (b0, _) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) } #[inline(always)] fn 
zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (_, a1) = self.split_u16x16(a); let (_, b1) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) } #[inline(always)] fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) } #[inline(always)] fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) } #[inline(always)] fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_u16x16(b); let (c0, c1) = self.split_u16x16(c); self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) } #[inline(always)] fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) } #[inline(always)] fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) } #[inline(always)] fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { [ a[0usize] as u8, a[1usize] as u8, a[2usize] as u8, a[3usize] as u8, a[4usize] as u8, a[5usize] as u8, a[6usize] as u8, a[7usize] as u8, a[8usize] as u8, a[9usize] as u8, a[10usize] as u8, a[11usize] as u8, a[12usize] as u8, a[13usize] as u8, a[14usize] as u8, a[15usize] as u8, ] .simd_into(self) } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { let (a0, a1) = self.split_u16x16(a); self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) } #[inline(always)] fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { let (a0, a1) = self.split_u16x16(a); self.combine_u32x4( self.reinterpret_u32_u16x8(a0), self.reinterpret_u32_u16x8(a1), ) } #[inline(always)] fn splat_mask16x16(self, a: i16) -> mask16x16 { let half = self.splat_mask16x8(a); self.combine_mask16x8(half, half) } #[inline(always)] fn not_mask16x16(self, a: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) } #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) } #[inline(always)] fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) } #[inline(always)] fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) 
= self.split_mask16x16(b); self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) } #[inline(always)] fn select_mask16x16( self, a: mask16x16, b: mask16x16, c: mask16x16, ) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); let (c0, c1) = self.split_mask16x16(c); self.combine_mask16x8( self.select_mask16x8(a0, b0, c0), self.select_mask16x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) } #[inline(always)] fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x8(self, a: i32) -> i32x8 { let half = self.splat_i32x4(a); self.combine_i32x4(half, half) } #[inline(always)] fn not_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) } #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) } #[inline(always)] fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) } #[inline(always)] fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) } #[inline(always)] fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) } #[inline(always)] fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) } #[inline(always)] fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) } #[inline(always)] fn shr_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b)) } #[inline(always)] fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) } #[inline(always)] fn shl_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shl_i32x4(a0, b), self.shl_i32x4(a1, b)) } #[inline(always)] fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) } #[inline(always)] fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> 
mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) } #[inline(always)] fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) } #[inline(always)] fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) } #[inline(always)] fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) } #[inline(always)] fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, _) = self.split_i32x8(a); let (b0, _) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) } #[inline(always)] fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (_, a1) = self.split_i32x8(a); let (_, b1) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) } #[inline(always)] fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) } #[inline(always)] fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) } #[inline(always)] fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_i32x8(b); let (c0, c1) = self.split_i32x8(c); self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) } #[inline(always)] fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) } #[inline(always)] fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) } #[inline(always)] fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) } #[inline(always)] fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { let (a0, a1) = self.split_i32x8(a); self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) } #[inline(always)] fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_u32x4( self.reinterpret_u32_i32x4(a0), self.reinterpret_u32_i32x4(a1), ) } #[inline(always)] fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { let (a0, a1) = self.split_i32x8(a); 
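// Convert each i32x4 half to f32x4 lane-wise; combining the halves preserves lane order.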
self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) } #[inline(always)] fn splat_u32x8(self, a: u32) -> u32x8 { let half = self.splat_u32x4(a); self.combine_u32x4(half, half) } #[inline(always)] fn not_u32x8(self, a: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) } #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) } #[inline(always)] fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) } #[inline(always)] fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) } #[inline(always)] fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) } #[inline(always)] fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) } #[inline(always)] fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) } #[inline(always)] fn shr_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b)) } #[inline(always)] fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) } #[inline(always)] fn shl_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shl_u32x4(a0, b), self.shl_u32x4(a1, b)) } #[inline(always)] fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) } #[inline(always)] fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) } #[inline(always)] fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) } #[inline(always)] fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) } #[inline(always)] fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) } #[inline(always)] fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, _) = self.split_u32x8(a); let (b0, _) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) } #[inline(always)] fn zip_high_u32x8(self, a: u32x8, 
b: u32x8) -> u32x8 { let (_, a1) = self.split_u32x8(a); let (_, b1) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) } #[inline(always)] fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) } #[inline(always)] fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) } #[inline(always)] fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_u32x8(b); let (c0, c1) = self.split_u32x8(c); self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) } #[inline(always)] fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) } #[inline(always)] fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) } #[inline(always)] fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { let (a0, a1) = self.split_u32x8(a); self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) } #[inline(always)] fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) } #[inline(always)] fn splat_mask32x8(self, a: i32) -> mask32x8 { let half = self.splat_mask32x4(a); self.combine_mask32x4(half, half) } #[inline(always)] fn not_mask32x8(self, a: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) } #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) } #[inline(always)] fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) } #[inline(always)] fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) } #[inline(always)] fn select_mask32x8( self, a: mask32x8, b: mask32x8, c: mask32x8, ) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); let (c0, c1) = self.split_mask32x8(c); self.combine_mask32x4( self.select_mask32x4(a0, b0, c0), self.select_mask32x4(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 
{ let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) } #[inline(always)] fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f64x4(self, a: f64) -> f64x4 { let half = self.splat_f64x2(a); self.combine_f64x2(half, half) } #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) } #[inline(always)] fn neg_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) } #[inline(always)] fn sqrt_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) } #[inline(always)] fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) } #[inline(always)] fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) } #[inline(always)] fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) } #[inline(always)] fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) } #[inline(always)] fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) } #[inline(always)] fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) } #[inline(always)] fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) } #[inline(always)] fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) } #[inline(always)] fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) } #[inline(always)] fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, _) = self.split_f64x4(a); let 
(b0, _) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) } #[inline(always)] fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (_, a1) = self.split_f64x4(a); let (_, b1) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) } #[inline(always)] fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) } #[inline(always)] fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) } #[inline(always)] fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) } #[inline(always)] fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.max_precise_f64x2(a0, b0), self.max_precise_f64x2(a1, b1), ) } #[inline(always)] fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) } #[inline(always)] fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.min_precise_f64x2(a0, b0), self.min_precise_f64x2(a1, b1), ) } #[inline(always)] fn madd_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1)) } #[inline(always)] fn msub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1)) } #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) } #[inline(always)] fn fract_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) } #[inline(always)] fn trunc_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) } #[inline(always)] fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) } #[inline(always)] fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { let mut result = [0.0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { let mut b0 = [0.0; 2usize]; let mut b1 = [0.0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); 
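// Bit-cast each f64x2 half to f32x4; every f64 lane reinterprets as two f32 lanes.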
self.combine_f32x4( self.reinterpret_f32_f64x2(a0), self.reinterpret_f32_f64x2(a1), ) } #[inline(always)] fn splat_mask64x4(self, a: i64) -> mask64x4 { let half = self.splat_mask64x2(a); self.combine_mask64x2(half, half) } #[inline(always)] fn not_mask64x4(self, a: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) } #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) } #[inline(always)] fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) } #[inline(always)] fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) } #[inline(always)] fn select_mask64x4( self, a: mask64x4, b: mask64x4, c: mask64x4, ) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); let (c0, c1) = self.split_mask64x4(c); self.combine_mask64x2( self.select_mask64x2(a0, b0, c0), self.select_mask64x2(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) } #[inline(always)] fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { let mut b0 = [0; 2usize]; let mut b1 = [0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f32x16(self, a: f32) -> f32x16 { let half = self.splat_f32x8(a); self.combine_f32x8(half, half) } #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) } #[inline(always)] fn neg_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) } #[inline(always)] fn sqrt_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) } #[inline(always)] fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) } #[inline(always)] fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) } #[inline(always)] fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); 
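// Divide corresponding 8-lane halves, then concatenate the two quotient halves.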
self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) } #[inline(always)] fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) } #[inline(always)] fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) } #[inline(always)] fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) } #[inline(always)] fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) } #[inline(always)] fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) } #[inline(always)] fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) } #[inline(always)] fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, _) = self.split_f32x16(a); let (b0, _) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) } #[inline(always)] fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (_, a1) = self.split_f32x16(a); let (_, b1) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) } #[inline(always)] fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) } #[inline(always)] fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) } #[inline(always)] fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) } #[inline(always)] fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.max_precise_f32x8(a0, b0), self.max_precise_f32x8(a1, b1), ) } #[inline(always)] fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) } #[inline(always)] fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.min_precise_f32x8(a0, b0), self.min_precise_f32x8(a1, b1), ) } #[inline(always)] fn madd_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.madd_f32x8(a0, b0, c0), 
self.madd_f32x8(a1, b1, c1)) } #[inline(always)] fn msub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1)) } #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) } #[inline(always)] fn fract_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) } #[inline(always)] fn trunc_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) } #[inline(always)] fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) } #[inline(always)] fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { let mut b0 = [0.0; 8usize]; let mut b1 = [0.0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( self.reinterpret_f64_f32x8(a0), self.reinterpret_f64_f32x8(a1), ) } #[inline(always)] fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8( self.reinterpret_i32_f32x8(a0), self.reinterpret_i32_f32x8(a1), ) } #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { [ src[0usize], src[4usize], src[8usize], src[12usize], src[1usize], src[5usize], src[9usize], src[13usize], src[2usize], src[6usize], src[10usize], src[14usize], src[3usize], src[7usize], src[11usize], src[15usize], ] .simd_into(self) } #[inline(always)] fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { *dest = [ a[0usize], a[4usize], a[8usize], a[12usize], a[1usize], a[5usize], a[9usize], a[13usize], a[2usize], a[6usize], a[10usize], a[14usize], a[3usize], a[7usize], a[11usize], a[15usize], ]; } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) } #[inline(always)] fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8( self.reinterpret_u32_f32x8(a0), self.reinterpret_u32_f32x8(a1), ) } #[inline(always)] fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) } #[inline(always)] fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) } #[inline(always)] fn splat_i8x64(self, a: i8) -> i8x64 { let half = self.splat_i8x32(a); self.combine_i8x32(half, half) } #[inline(always)] fn not_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) } #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.add_i8x32(a0, b0), 
self.add_i8x32(a1, b1)) } #[inline(always)] fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) } #[inline(always)] fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) } #[inline(always)] fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) } #[inline(always)] fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) } #[inline(always)] fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) } #[inline(always)] fn shr_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b)) } #[inline(always)] fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) } #[inline(always)] fn shl_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.shl_i8x32(a0, b), self.shl_i8x32(a1, b)) } #[inline(always)] fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) } #[inline(always)] fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) } #[inline(always)] fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) } #[inline(always)] fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) } #[inline(always)] fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) } #[inline(always)] fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, _) = self.split_i8x64(a); let (b0, _) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) } #[inline(always)] fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (_, a1) = self.split_i8x64(a); let (_, b1) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) } #[inline(always)] fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) } #[inline(always)] fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = 
self.split_i8x64(b); self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) } #[inline(always)] fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_i8x64(b); let (c0, c1) = self.split_i8x64(c); self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) } #[inline(always)] fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) } #[inline(always)] fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) } #[inline(always)] fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) } #[inline(always)] fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) } #[inline(always)] fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { let (a0, a1) = self.split_i8x64(a); self.combine_u32x8( self.reinterpret_u32_i8x32(a0), self.reinterpret_u32_i8x32(a1), ) } #[inline(always)] fn splat_u8x64(self, a: u8) -> u8x64 { let half = self.splat_u8x32(a); self.combine_u8x32(half, half) } #[inline(always)] fn not_u8x64(self, a: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) } #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) } #[inline(always)] fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) } #[inline(always)] fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) } #[inline(always)] fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) } #[inline(always)] fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) } #[inline(always)] fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) } #[inline(always)] fn shr_u8x64(self, a: u8x64, b: u32) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b)) } #[inline(always)] fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) } #[inline(always)] fn shl_u8x64(self, a: u8x64, b: u32) 
-> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shl_u8x32(a0, b), self.shl_u8x32(a1, b)) } #[inline(always)] fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) } #[inline(always)] fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) } #[inline(always)] fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) } #[inline(always)] fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) } #[inline(always)] fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) } #[inline(always)] fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, _) = self.split_u8x64(a); let (b0, _) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) } #[inline(always)] fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (_, a1) = self.split_u8x64(a); let (_, b1) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) } #[inline(always)] fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) } #[inline(always)] fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) } #[inline(always)] fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_u8x64(b); let (c0, c1) = self.split_u8x64(c); self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) } #[inline(always)] fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) } #[inline(always)] fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) } #[inline(always)] fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { [ src[0usize], src[4usize], src[8usize], src[12usize], src[16usize], src[20usize], src[24usize], src[28usize], src[32usize], src[36usize], src[40usize], src[44usize], src[48usize], src[52usize], src[56usize], src[60usize], src[1usize], src[5usize], src[9usize], src[13usize], src[17usize], src[21usize], src[25usize], src[29usize], src[33usize], src[37usize], src[41usize], 
src[45usize], src[49usize], src[53usize], src[57usize], src[61usize], src[2usize], src[6usize], src[10usize], src[14usize], src[18usize], src[22usize], src[26usize], src[30usize], src[34usize], src[38usize], src[42usize], src[46usize], src[50usize], src[54usize], src[58usize], src[62usize], src[3usize], src[7usize], src[11usize], src[15usize], src[19usize], src[23usize], src[27usize], src[31usize], src[35usize], src[39usize], src[43usize], src[47usize], src[51usize], src[55usize], src[59usize], src[63usize], ] .simd_into(self) } #[inline(always)] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { *dest = [ a[0usize], a[16usize], a[32usize], a[48usize], a[1usize], a[17usize], a[33usize], a[49usize], a[2usize], a[18usize], a[34usize], a[50usize], a[3usize], a[19usize], a[35usize], a[51usize], a[4usize], a[20usize], a[36usize], a[52usize], a[5usize], a[21usize], a[37usize], a[53usize], a[6usize], a[22usize], a[38usize], a[54usize], a[7usize], a[23usize], a[39usize], a[55usize], a[8usize], a[24usize], a[40usize], a[56usize], a[9usize], a[25usize], a[41usize], a[57usize], a[10usize], a[26usize], a[42usize], a[58usize], a[11usize], a[27usize], a[43usize], a[59usize], a[12usize], a[28usize], a[44usize], a[60usize], a[13usize], a[29usize], a[45usize], a[61usize], a[14usize], a[30usize], a[46usize], a[62usize], a[15usize], a[31usize], a[47usize], a[63usize], ]; } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { let (a0, a1) = self.split_u8x64(a); self.combine_u32x8( self.reinterpret_u32_u8x32(a0), self.reinterpret_u32_u8x32(a1), ) } #[inline(always)] fn splat_mask8x64(self, a: i8) -> mask8x64 { let half = self.splat_mask8x32(a); self.combine_mask8x32(half, half) } #[inline(always)] fn not_mask8x64(self, a: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) } #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) } #[inline(always)] fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) } #[inline(always)] fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) } #[inline(always)] fn select_mask8x64( self, a: mask8x64, b: mask8x64, c: mask8x64, ) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); let (c0, c1) = self.split_mask8x64(c); self.combine_mask8x32( self.select_mask8x32(a0, b0, c0), self.select_mask8x32(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) } #[inline(always)] fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x32(self, a: i16) -> i16x32 { let half = self.splat_i16x16(a); 
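// A splat is uniform, so building one 16-lane half and duplicating it fills all 32 lanes.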
self.combine_i16x16(half, half) } #[inline(always)] fn not_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) } #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) } #[inline(always)] fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) } #[inline(always)] fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) } #[inline(always)] fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) } #[inline(always)] fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) } #[inline(always)] fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) } #[inline(always)] fn shr_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b)) } #[inline(always)] fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) } #[inline(always)] fn shl_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shl_i16x16(a0, b), self.shl_i16x16(a1, b)) } #[inline(always)] fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) } #[inline(always)] fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) } #[inline(always)] fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) } #[inline(always)] fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) } #[inline(always)] fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) } #[inline(always)] fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, _) = self.split_i16x32(a); let (b0, _) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) } #[inline(always)] fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> 
i16x32 { let (_, a1) = self.split_i16x32(a); let (_, b1) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) } #[inline(always)] fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) } #[inline(always)] fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16( self.unzip_high_i16x16(a0, a1), self.unzip_high_i16x16(b0, b1), ) } #[inline(always)] fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_i16x32(b); let (c0, c1) = self.split_i16x32(c); self.combine_i16x16( self.select_i16x16(a0, b0, c0), self.select_i16x16(a1, b1, c1), ) } #[inline(always)] fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) } #[inline(always)] fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) } #[inline(always)] fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) } #[inline(always)] fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { let (a0, a1) = self.split_i16x32(a); self.combine_u8x32( self.reinterpret_u8_i16x16(a0), self.reinterpret_u8_i16x16(a1), ) } #[inline(always)] fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { let (a0, a1) = self.split_i16x32(a); self.combine_u32x8( self.reinterpret_u32_i16x16(a0), self.reinterpret_u32_i16x16(a1), ) } #[inline(always)] fn splat_u16x32(self, a: u16) -> u16x32 { let half = self.splat_u16x16(a); self.combine_u16x16(half, half) } #[inline(always)] fn not_u16x32(self, a: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) } #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) } #[inline(always)] fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) } #[inline(always)] fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) } #[inline(always)] fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) } #[inline(always)] fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, 
b1)) } #[inline(always)] fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) } #[inline(always)] fn shr_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, b)) } #[inline(always)] fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) } #[inline(always)] fn shl_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shl_u16x16(a0, b), self.shl_u16x16(a1, b)) } #[inline(always)] fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) } #[inline(always)] fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) } #[inline(always)] fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) } #[inline(always)] fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) } #[inline(always)] fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) } #[inline(always)] fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, _) = self.split_u16x32(a); let (b0, _) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) } #[inline(always)] fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (_, a1) = self.split_u16x32(a); let (_, b1) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) } #[inline(always)] fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) } #[inline(always)] fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16( self.unzip_high_u16x16(a0, a1), self.unzip_high_u16x16(b0, b1), ) } #[inline(always)] fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_u16x32(b); let (c0, c1) = self.split_u16x32(c); self.combine_u16x16( self.select_u16x16(a0, b0, c0), self.select_u16x16(a1, b1, c1), ) } #[inline(always)] fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) } #[inline(always)] fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = 
self.split_u16x32(b); self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) } #[inline(always)] fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { [ src[0usize], src[4usize], src[8usize], src[12usize], src[16usize], src[20usize], src[24usize], src[28usize], src[1usize], src[5usize], src[9usize], src[13usize], src[17usize], src[21usize], src[25usize], src[29usize], src[2usize], src[6usize], src[10usize], src[14usize], src[18usize], src[22usize], src[26usize], src[30usize], src[3usize], src[7usize], src[11usize], src[15usize], src[19usize], src[23usize], src[27usize], src[31usize], ] .simd_into(self) } #[inline(always)] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { *dest = [ a[0usize], a[8usize], a[16usize], a[24usize], a[1usize], a[9usize], a[17usize], a[25usize], a[2usize], a[10usize], a[18usize], a[26usize], a[3usize], a[11usize], a[19usize], a[27usize], a[4usize], a[12usize], a[20usize], a[28usize], a[5usize], a[13usize], a[21usize], a[29usize], a[6usize], a[14usize], a[22usize], a[30usize], a[7usize], a[15usize], a[23usize], a[31usize], ]; } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) } #[inline(always)] fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { let (a0, a1) = self.split_u16x32(a); self.combine_u8x32( self.reinterpret_u8_u16x16(a0), self.reinterpret_u8_u16x16(a1), ) } #[inline(always)] fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { let (a0, a1) = self.split_u16x32(a); self.combine_u32x8( self.reinterpret_u32_u16x16(a0), self.reinterpret_u32_u16x16(a1), ) } #[inline(always)] fn splat_mask16x32(self, a: i16) -> mask16x32 { let half = self.splat_mask16x16(a); self.combine_mask16x16(half, half) } #[inline(always)] fn not_mask16x32(self, a: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) } #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) } #[inline(always)] fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) } #[inline(always)] fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) } #[inline(always)] fn select_mask16x32( self, a: mask16x32, b: mask16x32, c: mask16x32, ) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); let (c0, c1) = self.split_mask16x32(c); self.combine_mask16x16( self.select_mask16x16(a0, b0, c0), self.select_mask16x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16( self.simd_eq_mask16x16(a0, 
b0), self.simd_eq_mask16x16(a1, b1), ) } #[inline(always)] fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x16(self, a: i32) -> i32x16 { let half = self.splat_i32x8(a); self.combine_i32x8(half, half) } #[inline(always)] fn not_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) } #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) } #[inline(always)] fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) } #[inline(always)] fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) } #[inline(always)] fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) } #[inline(always)] fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) } #[inline(always)] fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) } #[inline(always)] fn shr_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b)) } #[inline(always)] fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) } #[inline(always)] fn shl_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shl_i32x8(a0, b), self.shl_i32x8(a1, b)) } #[inline(always)] fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) } #[inline(always)] fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) } #[inline(always)] fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) } #[inline(always)] fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) } #[inline(always)] fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); 
self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) } #[inline(always)] fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, _) = self.split_i32x16(a); let (b0, _) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) } #[inline(always)] fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (_, a1) = self.split_i32x16(a); let (_, b1) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) } #[inline(always)] fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) } #[inline(always)] fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) } #[inline(always)] fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_i32x16(b); let (c0, c1) = self.split_i32x16(c); self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) } #[inline(always)] fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) } #[inline(always)] fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) } #[inline(always)] fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) } #[inline(always)] fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { let (a0, a1) = self.split_i32x16(a); self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) } #[inline(always)] fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_u32x8( self.reinterpret_u32_i32x8(a0), self.reinterpret_u32_i32x8(a1), ) } #[inline(always)] fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) } #[inline(always)] fn splat_u32x16(self, a: u32) -> u32x16 { let half = self.splat_u32x8(a); self.combine_u32x8(half, half) } #[inline(always)] fn not_u32x16(self, a: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) } #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) } #[inline(always)] fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) } #[inline(always)] fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); 
self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) } #[inline(always)] fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) } #[inline(always)] fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) } #[inline(always)] fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) } #[inline(always)] fn shr_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b)) } #[inline(always)] fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) } #[inline(always)] fn shl_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shl_u32x8(a0, b), self.shl_u32x8(a1, b)) } #[inline(always)] fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) } #[inline(always)] fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) } #[inline(always)] fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) } #[inline(always)] fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) } #[inline(always)] fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) } #[inline(always)] fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, _) = self.split_u32x16(a); let (b0, _) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) } #[inline(always)] fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (_, a1) = self.split_u32x16(a); let (_, b1) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) } #[inline(always)] fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) } #[inline(always)] fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) } #[inline(always)] fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_u32x16(b); let (c0, c1) = 
self.split_u32x16(c); self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) } #[inline(always)] fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) } #[inline(always)] fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) } #[inline(always)] fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { [ src[0usize], src[4usize], src[8usize], src[12usize], src[1usize], src[5usize], src[9usize], src[13usize], src[2usize], src[6usize], src[10usize], src[14usize], src[3usize], src[7usize], src[11usize], src[15usize], ] .simd_into(self) } #[inline(always)] fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { *dest = [ a[0usize], a[4usize], a[8usize], a[12usize], a[1usize], a[5usize], a[9usize], a[13usize], a[2usize], a[6usize], a[10usize], a[14usize], a[3usize], a[7usize], a[11usize], a[15usize], ]; } #[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { let (a0, a1) = self.split_u32x16(a); self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) } #[inline(always)] fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) } #[inline(always)] fn splat_mask32x16(self, a: i32) -> mask32x16 { let half = self.splat_mask32x8(a); self.combine_mask32x8(half, half) } #[inline(always)] fn not_mask32x16(self, a: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) } #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) } #[inline(always)] fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) } #[inline(always)] fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) } #[inline(always)] fn select_mask32x16( self, a: mask32x16, b: mask32x16, c: mask32x16, ) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); let (c0, c1) = self.split_mask32x16(c); self.combine_mask32x8( self.select_mask32x8(a0, b0, c0), self.select_mask32x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) } #[inline(always)] fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; 
b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f64x8(self, a: f64) -> f64x8 { let half = self.splat_f64x4(a); self.combine_f64x4(half, half) } #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) } #[inline(always)] fn neg_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) } #[inline(always)] fn sqrt_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) } #[inline(always)] fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) } #[inline(always)] fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) } #[inline(always)] fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) } #[inline(always)] fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) } #[inline(always)] fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) } #[inline(always)] fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) } #[inline(always)] fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) } #[inline(always)] fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) } #[inline(always)] fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) } #[inline(always)] fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, _) = self.split_f64x8(a); let (b0, _) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) } #[inline(always)] fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (_, a1) = self.split_f64x8(a); let (_, b1) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) } #[inline(always)] fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_low_f64x4(a0, a1), 
self.unzip_low_f64x4(b0, b1)) } #[inline(always)] fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) } #[inline(always)] fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) } #[inline(always)] fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.max_precise_f64x4(a0, b0), self.max_precise_f64x4(a1, b1), ) } #[inline(always)] fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) } #[inline(always)] fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.min_precise_f64x4(a0, b0), self.min_precise_f64x4(a1, b1), ) } #[inline(always)] fn madd_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1)) } #[inline(always)] fn msub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1)) } #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) } #[inline(always)] fn fract_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) } #[inline(always)] fn trunc_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) } #[inline(always)] fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) } #[inline(always)] fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( self.reinterpret_f32_f64x4(a0), self.reinterpret_f32_f64x4(a1), ) } #[inline(always)] fn splat_mask64x8(self, a: i64) -> mask64x8 { let half = self.splat_mask64x4(a); self.combine_mask64x4(half, half) } #[inline(always)] fn not_mask64x8(self, a: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1)) } #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1)) } #[inline(always)] fn or_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = 
self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1)) }
    #[inline(always)] fn xor_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1)) }
    #[inline(always)] fn select_mask64x8(self, a: mask64x8, b: mask64x8, c: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); let (c0, c1) = self.split_mask64x8(c); self.combine_mask64x4(self.select_mask64x4(a0, b0, c0), self.select_mask64x4(a1, b1, c1)) }
    #[inline(always)] fn simd_eq_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1)) }
    #[inline(always)] fn split_mask64x8(self, a: mask64x8) -> (mask64x4, mask64x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) }
}
fearless_simd-0.3.0/src/generated/neon.rs000064400000000000000000006234201046102023000165050ustar 00000000000000
// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT
// This file is autogenerated by fearless_simd_gen
use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, };
use core::arch::aarch64::*;
#[doc = r#" The SIMD token for the "neon" level."#]
#[derive(Clone, Copy, Debug)]
pub struct Neon { pub neon: crate::core_arch::aarch64::Neon, }
impl Neon { #[inline] pub const unsafe fn new_unchecked() -> Self { Neon { neon: unsafe { crate::core_arch::aarch64::Neon::new_unchecked() }, } } }
impl Seal for Neon {}
impl Simd for Neon {
    type f32s = f32x4; type u8s = u8x16; type i8s = i8x16; type u16s = u16x8; type i16s = i16x8; type u32s = u32x4; type i32s = i32x4; type mask8s = mask8x16; type mask16s = mask16x8; type mask32s = mask32x4;
    #[inline(always)] fn level(self) -> Level { Level::Neon(self) }
    #[inline] fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R { #[target_feature(enable = "neon")] #[inline] unsafe fn vectorize_neon<F: FnOnce() -> R, R>(f: F) -> R { f() } unsafe { vectorize_neon(f) } }
    #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4 { unsafe { vdupq_n_f32(val).simd_into(self) } }
    #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { unsafe { vabsq_f32(a.into()).simd_into(self) } }
    #[inline(always)] fn neg_f32x4(self, a: f32x4) -> f32x4 { unsafe { vnegq_f32(a.into()).simd_into(self) } }
    #[inline(always)] fn sqrt_f32x4(self, a: f32x4) -> f32x4 { unsafe { vsqrtq_f32(a.into()).simd_into(self) } }
    #[inline(always)] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vaddq_f32(a.into(), b.into()).simd_into(self) } }
    #[inline(always)] fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vsubq_f32(a.into(), b.into()).simd_into(self) } }
    #[inline(always)] fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vmulq_f32(a.into(), b.into()).simd_into(self) } }
    #[inline(always)] fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vdivq_f32(a.into(),
b.into()).simd_into(self) } } #[inline(always)] fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { let sign_mask = vdupq_n_u32(1 << 31); vbslq_f32(sign_mask, b.into(), a.into()).simd_into(self) } } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vceqq_f32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcltq_f32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcleq_f32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcgeq_f32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcgtq_f32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { let x = a.into(); let y = b.into(); unsafe { vzip1q_f32(x, y).simd_into(self) } } #[inline(always)] fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { let x = a.into(); let y = b.into(); unsafe { vzip2q_f32(x, y).simd_into(self) } } #[inline(always)] fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { let x = a.into(); let y = b.into(); unsafe { vuzp1q_f32(x, y).simd_into(self) } } #[inline(always)] fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { let x = a.into(); let y = b.into(); unsafe { vuzp2q_f32(x, y).simd_into(self) } } #[inline(always)] fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vmaxq_f32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vmaxnmq_f32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vminq_f32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { vminnmq_f32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn madd_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { unsafe { vfmaq_f32(c.into(), b.into(), a.into()).simd_into(self) } } #[inline(always)] fn msub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { unsafe { vnegq_f32(vfmsq_f32(c.into(), b.into(), a.into())).simd_into(self) } } #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { unsafe { vrndmq_f32(a.into()).simd_into(self) } } #[inline(always)] fn fract_f32x4(self, a: f32x4) -> f32x4 { unsafe { let c1 = vcvtq_s32_f32(a.into()); let c2 = vcvtq_f32_s32(c1); vsubq_f32(a.into(), c2).simd_into(self) } } #[inline(always)] fn trunc_f32x4(self, a: f32x4) -> f32x4 { unsafe { vrndq_f32(a.into()).simd_into(self) } } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { unsafe { vbslq_f32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { let mut result = [0.0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { unsafe { vreinterpretq_f64_f32(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { unsafe { vreinterpretq_s32_f32(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u8_f32x4(self, 
a: f32x4) -> u8x16 { unsafe { vreinterpretq_u8_f32(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { unsafe { vreinterpretq_u32_f32(a.into()).simd_into(self) } } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { unsafe { vcvtq_u32_f32(a.into()).simd_into(self) } } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { unsafe { vcvtq_s32_f32(a.into()).simd_into(self) } } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { unsafe { vdupq_n_s8(val).simd_into(self) } } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { unsafe { vmvnq_s8(a.into()).simd_into(self) } } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { vaddq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { vsubq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { vmulq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { vandq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { vorrq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { veorq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { unsafe { vshlq_s8(a.into(), vdupq_n_s8(-(shift as i8))).simd_into(self) } } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { vshlq_s8(a.into(), vnegq_s8(b.into())).simd_into(self) } } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { unsafe { vshlq_s8(a.into(), vdupq_n_s8(shift as i8)).simd_into(self) } } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vceqq_s8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vcltq_s8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vcleq_s8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vcgeq_s8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vcgtq_s8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { let x = a.into(); let y = b.into(); unsafe { vzip1q_s8(x, y).simd_into(self) } } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { let x = a.into(); let y = b.into(); unsafe { vzip2q_s8(x, y).simd_into(self) } } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { let x = a.into(); let y = b.into(); unsafe { vuzp1q_s8(x, y).simd_into(self) } } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { let x = a.into(); let y = b.into(); unsafe { vuzp2q_s8(x, y).simd_into(self) } } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { unsafe { vbslq_s8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { vminq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn 
max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { vmaxq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { unsafe { vnegq_s8(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { unsafe { vreinterpretq_u8_s8(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { unsafe { vreinterpretq_u32_s8(a.into()).simd_into(self) } } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { unsafe { vdupq_n_u8(val).simd_into(self) } } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { unsafe { vmvnq_u8(a.into()).simd_into(self) } } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { vaddq_u8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { vsubq_u8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { vmulq_u8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { vandq_u8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { vorrq_u8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { veorq_u8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { unsafe { vshlq_u8(a.into(), vdupq_n_s8(-(shift as i8))).simd_into(self) } } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { vshlq_u8(a.into(), vnegq_s8(vreinterpretq_s8_u8(b.into()))).simd_into(self) } } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { unsafe { vshlq_u8(a.into(), vdupq_n_s8(shift as i8)).simd_into(self) } } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vceqq_u8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vcltq_u8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vcleq_u8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vcgeq_u8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vcgtq_u8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { let x = a.into(); let y = b.into(); unsafe { vzip1q_u8(x, y).simd_into(self) } } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { let x = a.into(); let y = b.into(); unsafe { vzip2q_u8(x, y).simd_into(self) } } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { let x = a.into(); let y = b.into(); unsafe { vuzp1q_u8(x, y).simd_into(self) } } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { let x = a.into(); let y = b.into(); unsafe { vuzp2q_u8(x, y).simd_into(self) } } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, 
c: u8x16) -> u8x16 { unsafe { vbslq_u8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { vminq_u8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { vmaxq_u8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { unsafe { let low = vmovl_u8(vget_low_u8(a.into())); let high = vmovl_u8(vget_high_u8(a.into())); uint16x8x2_t(low, high).simd_into(self) } } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { unsafe { vreinterpretq_u32_u8(a.into()).simd_into(self) } } #[inline(always)] fn splat_mask8x16(self, val: i8) -> mask8x16 { unsafe { vdupq_n_s8(val).simd_into(self) } } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { unsafe { vmvnq_s8(a.into()).simd_into(self) } } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { vandq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { vorrq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { veorq_s8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask8x16( self, a: mask8x16, b: mask8x16, c: mask8x16, ) -> mask8x16 { unsafe { vbslq_s8(vreinterpretq_u8_s8(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { vreinterpretq_s8_u8(vceqq_s8(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { unsafe { vdupq_n_s16(val).simd_into(self) } } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { unsafe { vmvnq_s16(a.into()).simd_into(self) } } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { vaddq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { vsubq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { vmulq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { vandq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { vorrq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { veorq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { unsafe { vshlq_s16(a.into(), vdupq_n_s16(-(shift as i16))).simd_into(self) } } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { vshlq_s16(a.into(), vnegq_s16(b.into())).simd_into(self) } } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { unsafe { vshlq_s16(a.into(), vdupq_n_s16(shift as i16)).simd_into(self) } } 
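    // The shift implementations above and below follow a standard NEON idiom:
    // `vshlq_*` is the only per-lane variable-shift intrinsic, and a negative
    // count shifts right, so a uniform right shift becomes a left shift by a
    // splatted, negated count. Rough sketch of what `shr_i16x8(a, 3)` reduces
    // to (variable names here are illustrative, not part of the API):
    //     let n = vdupq_n_s16(-3); // negative count selects a right shift
    //     vshlq_s16(a, n)          // arithmetic shift right by 3 in each lane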
#[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vceqq_s16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vcltq_s16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vcleq_s16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vcgeq_s16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vcgtq_s16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { let x = a.into(); let y = b.into(); unsafe { vzip1q_s16(x, y).simd_into(self) } } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { let x = a.into(); let y = b.into(); unsafe { vzip2q_s16(x, y).simd_into(self) } } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { let x = a.into(); let y = b.into(); unsafe { vuzp1q_s16(x, y).simd_into(self) } } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { let x = a.into(); let y = b.into(); unsafe { vuzp2q_s16(x, y).simd_into(self) } } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { unsafe { vbslq_s16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { vminq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { vmaxq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { unsafe { vnegq_s16(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { unsafe { vreinterpretq_u8_s16(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { unsafe { vreinterpretq_u32_s16(a.into()).simd_into(self) } } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { unsafe { vdupq_n_u16(val).simd_into(self) } } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { unsafe { vmvnq_u16(a.into()).simd_into(self) } } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { vaddq_u16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { vsubq_u16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { vmulq_u16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { vandq_u16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { vorrq_u16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { veorq_u16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { unsafe { vshlq_u16(a.into(), vdupq_n_s16(-(shift as i16))).simd_into(self) } } 
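    // The comparisons above return crate mask vectors: signed lanes that are
    // all ones for true and all zeros for false. NEON comparison intrinsics
    // yield unsigned vectors, hence the `vreinterpretq_s16_u16` wrapping, and
    // `select_*` feeds the mask straight into `vbslq_*`, a pure bitwise select.
    // Illustrative lane view (values only, not literal API):
    //     simd_lt_i16x8([1, 5, ..], [4, 2, ..]) -> [-1, 0, ..] // -1 = all bits set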
#[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { vshlq_u16(a.into(), vnegq_s16(vreinterpretq_s16_u16(b.into()))).simd_into(self) } } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { unsafe { vshlq_u16(a.into(), vdupq_n_s16(shift as i16)).simd_into(self) } } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vceqq_u16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vcltq_u16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vcleq_u16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vcgeq_u16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { vreinterpretq_s16_u16(vcgtq_u16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { let x = a.into(); let y = b.into(); unsafe { vzip1q_u16(x, y).simd_into(self) } } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { let x = a.into(); let y = b.into(); unsafe { vzip2q_u16(x, y).simd_into(self) } } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { let x = a.into(); let y = b.into(); unsafe { vuzp1q_u16(x, y).simd_into(self) } } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { let x = a.into(); let y = b.into(); unsafe { vuzp2q_u16(x, y).simd_into(self) } } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { unsafe { vbslq_u16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { vminq_u16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { vmaxq_u16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { unsafe { vreinterpretq_u8_u16(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { unsafe { vreinterpretq_u32_u16(a.into()).simd_into(self) } } #[inline(always)] fn splat_mask16x8(self, val: i16) -> mask16x8 { unsafe { vdupq_n_s16(val).simd_into(self) } } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { unsafe { vmvnq_s16(a.into()).simd_into(self) } } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { vandq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { vorrq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { veorq_s16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask16x8( self, a: mask16x8, b: mask16x8, c: mask16x8, ) -> mask16x8 { unsafe { vbslq_s16(vreinterpretq_u16_s16(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { 
vreinterpretq_s16_u16(vceqq_s16(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { unsafe { vdupq_n_s32(val).simd_into(self) } } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { unsafe { vmvnq_s32(a.into()).simd_into(self) } } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { vaddq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { vsubq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { vmulq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { vandq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { vorrq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { veorq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { unsafe { vshlq_s32(a.into(), vdupq_n_s32(-(shift as i32))).simd_into(self) } } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { vshlq_s32(a.into(), vnegq_s32(b.into())).simd_into(self) } } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { unsafe { vshlq_s32(a.into(), vdupq_n_s32(shift as i32)).simd_into(self) } } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vceqq_s32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcltq_s32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcleq_s32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcgeq_s32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcgtq_s32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { let x = a.into(); let y = b.into(); unsafe { vzip1q_s32(x, y).simd_into(self) } } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { let x = a.into(); let y = b.into(); unsafe { vzip2q_s32(x, y).simd_into(self) } } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { let x = a.into(); let y = b.into(); unsafe { vuzp1q_s32(x, y).simd_into(self) } } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { let x = a.into(); let y = b.into(); unsafe { vuzp2q_s32(x, y).simd_into(self) } } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { unsafe { vbslq_s32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { vminq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { vmaxq_s32(a.into(), b.into()).simd_into(self) } } 
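    // NEON registers top out at 128 bits, so the wider crate types (`i32x8`,
    // `f32x16`, ...) are backed by plain arrays. `combine_*` just below
    // concatenates two native-width halves by copying, and the doubled-width
    // operations earlier in this dump split, recurse, and recombine, so after
    // inlining a wide op should be little more than a pair of native ops.
    // Sketch of the intended lowering (not the literal generated code):
    //     add_i32x8(a, b) ~ combine_i32x4(vaddq_s32(a0, b0), vaddq_s32(a1, b1))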
#[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { unsafe { vnegq_s32(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { unsafe { vreinterpretq_u8_s32(a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { unsafe { vreinterpretq_u32_s32(a.into()).simd_into(self) } } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { unsafe { vcvtq_f32_s32(a.into()).simd_into(self) } } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { unsafe { vdupq_n_u32(val).simd_into(self) } } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { unsafe { vmvnq_u32(a.into()).simd_into(self) } } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { vaddq_u32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { vsubq_u32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { vmulq_u32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { vandq_u32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { vorrq_u32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { veorq_u32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { unsafe { vshlq_u32(a.into(), vdupq_n_s32(-(shift as i32))).simd_into(self) } } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { vshlq_u32(a.into(), vnegq_s32(vreinterpretq_s32_u32(b.into()))).simd_into(self) } } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { unsafe { vshlq_u32(a.into(), vdupq_n_s32(shift as i32)).simd_into(self) } } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vceqq_u32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcltq_u32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcleq_u32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcgeq_u32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vcgtq_u32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { let x = a.into(); let y = b.into(); unsafe { vzip1q_u32(x, y).simd_into(self) } } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { let x = a.into(); let y = b.into(); unsafe { vzip2q_u32(x, y).simd_into(self) } } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { let x = a.into(); let y = b.into(); unsafe { vuzp1q_u32(x, y).simd_into(self) } } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { let x = a.into(); let y = b.into(); unsafe { vuzp2q_u32(x, y).simd_into(self) } } 
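// NEON's bitwise select (`vbsl`) takes the mask operand first: set bits pick
// from the second argument, clear bits from the third, which lines up exactly
// with select(mask, if_true, if_false).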
#[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { unsafe { vbslq_u32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { vminq_u32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { vmaxq_u32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { unsafe { vreinterpretq_u8_u32(a.into()).simd_into(self) } } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { unsafe { vcvtq_f32_u32(a.into()).simd_into(self) } } #[inline(always)] fn splat_mask32x4(self, val: i32) -> mask32x4 { unsafe { vdupq_n_s32(val).simd_into(self) } } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { unsafe { vmvnq_s32(a.into()).simd_into(self) } } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { vandq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { vorrq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { veorq_s32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask32x4( self, a: mask32x4, b: mask32x4, c: mask32x4, ) -> mask32x4 { unsafe { vbslq_s32(vreinterpretq_u32_s32(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { vreinterpretq_s32_u32(vceqq_s32(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { unsafe { vdupq_n_f64(val).simd_into(self) } } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { unsafe { vabsq_f64(a.into()).simd_into(self) } } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { unsafe { vnegq_f64(a.into()).simd_into(self) } } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { unsafe { vsqrtq_f64(a.into()).simd_into(self) } } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vaddq_f64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vsubq_f64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vmulq_f64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vdivq_f64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { let sign_mask = vdupq_n_u64(1 << 63); vbslq_f64(sign_mask, b.into(), a.into()).simd_into(self) } } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { vreinterpretq_s64_u64(vceqq_f64(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { vreinterpretq_s64_u64(vcltq_f64(a.into(), b.into())).simd_into(self) } } 
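// As with the integer comparisons above, the float comparisons produce
// all-ones/all-zeros u64 lanes, which are reinterpreted to s64 to form the mask.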
#[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { vreinterpretq_s64_u64(vcleq_f64(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { vreinterpretq_s64_u64(vcgeq_f64(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { vreinterpretq_s64_u64(vcgtq_f64(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { let x = a.into(); let y = b.into(); unsafe { vzip1q_f64(x, y).simd_into(self) } } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { let x = a.into(); let y = b.into(); unsafe { vzip2q_f64(x, y).simd_into(self) } } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { let x = a.into(); let y = b.into(); unsafe { vuzp1q_f64(x, y).simd_into(self) } } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { let x = a.into(); let y = b.into(); unsafe { vuzp2q_f64(x, y).simd_into(self) } } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vmaxq_f64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vmaxnmq_f64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vminq_f64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { vminnmq_f64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn madd_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { unsafe { vfmaq_f64(c.into(), b.into(), a.into()).simd_into(self) } } #[inline(always)] fn msub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { unsafe { vnegq_f64(vfmsq_f64(c.into(), b.into(), a.into())).simd_into(self) } } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { unsafe { vrndmq_f64(a.into()).simd_into(self) } } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { unsafe { let c1 = vcvtq_s64_f64(a.into()); let c2 = vcvtq_f64_s64(c1); vsubq_f64(a.into(), c2).simd_into(self) } } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { unsafe { vrndq_f64(a.into()).simd_into(self) } } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { unsafe { vbslq_f64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { let mut result = [0.0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { unsafe { vreinterpretq_f32_f64(a.into()).simd_into(self) } } #[inline(always)] fn splat_mask64x2(self, val: i64) -> mask64x2 { unsafe { vdupq_n_s64(val).simd_into(self) } } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { unsafe { vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a.into()))).simd_into(self) } } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { vandq_s64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { vorrq_s64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { veorq_s64(a.into(), b.into()).simd_into(self) } } 
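// There is no 64-bit `vmvn`, which is why `not_mask64x2` above round-trips
// through 32-bit lanes; bitwise NOT is lane-size agnostic, so the reinterpret
// casts are free.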
#[inline(always)] fn select_mask64x2( self, a: mask64x2, b: mask64x2, c: mask64x2, ) -> mask64x2 { unsafe { vbslq_s64(vreinterpretq_u64_s64(a.into()), b.into(), c.into()).simd_into(self) } } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { vreinterpretq_s64_u64(vceqq_s64(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { let mut result = [0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f32x8(self, a: f32) -> f32x8 { let half = self.splat_f32x4(a); self.combine_f32x4(half, half) } #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) } #[inline(always)] fn neg_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) } #[inline(always)] fn sqrt_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) } #[inline(always)] fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) } #[inline(always)] fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) } #[inline(always)] fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) } #[inline(always)] fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) } #[inline(always)] fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) } #[inline(always)] fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) } #[inline(always)] fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) } #[inline(always)] fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) } #[inline(always)] fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) } #[inline(always)] fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, _) = self.split_f32x8(a); let (b0, _) = self.split_f32x8(b); 
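// A 256-bit zip-low fully interleaves the two low halves, so it needs both
// vzip1 and vzip2 of those halves to fill the result.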
self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) } #[inline(always)] fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (_, a1) = self.split_f32x8(a); let (_, b1) = self.split_f32x8(b); self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) } #[inline(always)] fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) } #[inline(always)] fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) } #[inline(always)] fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) } #[inline(always)] fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.max_precise_f32x4(a0, b0), self.max_precise_f32x4(a1, b1), ) } #[inline(always)] fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.min_precise_f32x4(a0, b0), self.min_precise_f32x4(a1, b1), ) } #[inline(always)] fn madd_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1)) } #[inline(always)] fn msub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1)) } #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) } #[inline(always)] fn fract_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) } #[inline(always)] fn trunc_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) } #[inline(always)] fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) } #[inline(always)] fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { let mut result = [0.0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( 
self.reinterpret_f64_f32x4(a0), self.reinterpret_f64_f32x4(a1), ) } #[inline(always)] fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4( self.reinterpret_i32_f32x4(a0), self.reinterpret_i32_f32x4(a1), ) } #[inline(always)] fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { let (a0, a1) = self.split_f32x8(a); self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) } #[inline(always)] fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4( self.reinterpret_u32_f32x4(a0), self.reinterpret_u32_f32x4(a1), ) } #[inline(always)] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) } #[inline(always)] fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) } #[inline(always)] fn splat_i8x32(self, a: i8) -> i8x32 { let half = self.splat_i8x16(a); self.combine_i8x16(half, half) } #[inline(always)] fn not_i8x32(self, a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) } #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) } #[inline(always)] fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) } #[inline(always)] fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) } #[inline(always)] fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) } #[inline(always)] fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) } #[inline(always)] fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) } #[inline(always)] fn shr_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b)) } #[inline(always)] fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) } #[inline(always)] fn shl_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shl_i8x16(a0, b), self.shl_i8x16(a1, b)) } #[inline(always)] fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) } #[inline(always)] fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) } #[inline(always)] fn simd_le_i8x32(self, a: 
i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) } #[inline(always)] fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) } #[inline(always)] fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) } #[inline(always)] fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, _) = self.split_i8x32(a); let (b0, _) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) } #[inline(always)] fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (_, a1) = self.split_i8x32(a); let (_, b1) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) } #[inline(always)] fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) } #[inline(always)] fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) } #[inline(always)] fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_i8x32(b); let (c0, c1) = self.split_i8x32(c); self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) } #[inline(always)] fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) } #[inline(always)] fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) } #[inline(always)] fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x32(self, a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) } #[inline(always)] fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) } #[inline(always)] fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { let (a0, a1) = self.split_i8x32(a); self.combine_u32x4( self.reinterpret_u32_i8x16(a0), self.reinterpret_u32_i8x16(a1), ) } #[inline(always)] fn splat_u8x32(self, a: u8) -> u8x32 { let half = self.splat_u8x16(a); self.combine_u8x16(half, half) } #[inline(always)] fn not_u8x32(self, a: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) } #[inline(always)] fn 
add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) } #[inline(always)] fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) } #[inline(always)] fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) } #[inline(always)] fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) } #[inline(always)] fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) } #[inline(always)] fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) } #[inline(always)] fn shr_u8x32(self, a: u8x32, b: u32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b)) } #[inline(always)] fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) } #[inline(always)] fn shl_u8x32(self, a: u8x32, b: u32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.shl_u8x16(a0, b), self.shl_u8x16(a1, b)) } #[inline(always)] fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) } #[inline(always)] fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) } #[inline(always)] fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) } #[inline(always)] fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) } #[inline(always)] fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) } #[inline(always)] fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, _) = self.split_u8x32(a); let (b0, _) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) } #[inline(always)] fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (_, a1) = self.split_u8x32(a); let (_, b1) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) } #[inline(always)] fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_low_u8x16(a0, a1), 
self.unzip_low_u8x16(b0, b1)) } #[inline(always)] fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) } #[inline(always)] fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_u8x32(b); let (c0, c1) = self.split_u8x32(c); self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) } #[inline(always)] fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) } #[inline(always)] fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) } #[inline(always)] fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn widen_u8x32(self, a: u8x32) -> u16x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) } #[inline(always)] fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { let (a0, a1) = self.split_u8x32(a); self.combine_u32x4( self.reinterpret_u32_u8x16(a0), self.reinterpret_u32_u8x16(a1), ) } #[inline(always)] fn splat_mask8x32(self, a: i8) -> mask8x32 { let half = self.splat_mask8x16(a); self.combine_mask8x16(half, half) } #[inline(always)] fn not_mask8x32(self, a: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) } #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) } #[inline(always)] fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) } #[inline(always)] fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) } #[inline(always)] fn select_mask8x32( self, a: mask8x32, b: mask8x32, c: mask8x32, ) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); let (c0, c1) = self.split_mask8x32(c); self.combine_mask8x16( self.select_mask8x16(a0, b0, c0), self.select_mask8x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) } #[inline(always)] fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); 
result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x16(self, a: i16) -> i16x16 { let half = self.splat_i16x8(a); self.combine_i16x8(half, half) } #[inline(always)] fn not_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) } #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) } #[inline(always)] fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) } #[inline(always)] fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) } #[inline(always)] fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) } #[inline(always)] fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) } #[inline(always)] fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) } #[inline(always)] fn shr_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b)) } #[inline(always)] fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) } #[inline(always)] fn shl_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shl_i16x8(a0, b), self.shl_i16x8(a1, b)) } #[inline(always)] fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) } #[inline(always)] fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) } #[inline(always)] fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) } #[inline(always)] fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) } #[inline(always)] fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = 
self.split_i16x16(b); self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) } #[inline(always)] fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, _) = self.split_i16x16(a); let (b0, _) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) } #[inline(always)] fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (_, a1) = self.split_i16x16(a); let (_, b1) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) } #[inline(always)] fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) } #[inline(always)] fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) } #[inline(always)] fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_i16x16(b); let (c0, c1) = self.split_i16x16(c); self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) } #[inline(always)] fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) } #[inline(always)] fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) } #[inline(always)] fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) } #[inline(always)] fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { let (a0, a1) = self.split_i16x16(a); self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) } #[inline(always)] fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { let (a0, a1) = self.split_i16x16(a); self.combine_u32x4( self.reinterpret_u32_i16x8(a0), self.reinterpret_u32_i16x8(a1), ) } #[inline(always)] fn splat_u16x16(self, a: u16) -> u16x16 { let half = self.splat_u16x8(a); self.combine_u16x8(half, half) } #[inline(always)] fn not_u16x16(self, a: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) } #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) } #[inline(always)] fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) } #[inline(always)] fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let 
(a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) } #[inline(always)] fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) } #[inline(always)] fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) } #[inline(always)] fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) } #[inline(always)] fn shr_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b)) } #[inline(always)] fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) } #[inline(always)] fn shl_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shl_u16x8(a0, b), self.shl_u16x8(a1, b)) } #[inline(always)] fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) } #[inline(always)] fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) } #[inline(always)] fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) } #[inline(always)] fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) } #[inline(always)] fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) } #[inline(always)] fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, _) = self.split_u16x16(a); let (b0, _) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) } #[inline(always)] fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (_, a1) = self.split_u16x16(a); let (_, b1) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) } #[inline(always)] fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) } #[inline(always)] fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) } #[inline(always)] fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { let (a0, a1) = 
self.split_mask16x16(a); let (b0, b1) = self.split_u16x16(b); let (c0, c1) = self.split_u16x16(c); self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) } #[inline(always)] fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) } #[inline(always)] fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) } #[inline(always)] fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { unsafe { let converted: uint16x8x2_t = a.into(); let low = vmovn_u16(converted.0); let high = vmovn_u16(converted.1); vcombine_u8(low, high).simd_into(self) } } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { let (a0, a1) = self.split_u16x16(a); self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) } #[inline(always)] fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { let (a0, a1) = self.split_u16x16(a); self.combine_u32x4( self.reinterpret_u32_u16x8(a0), self.reinterpret_u32_u16x8(a1), ) } #[inline(always)] fn splat_mask16x16(self, a: i16) -> mask16x16 { let half = self.splat_mask16x8(a); self.combine_mask16x8(half, half) } #[inline(always)] fn not_mask16x16(self, a: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) } #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) } #[inline(always)] fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) } #[inline(always)] fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) } #[inline(always)] fn select_mask16x16( self, a: mask16x16, b: mask16x16, c: mask16x16, ) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); let (c0, c1) = self.split_mask16x16(c); self.combine_mask16x8( self.select_mask16x8(a0, b0, c0), self.select_mask16x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) } #[inline(always)] fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } 
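// `split_*` is the inverse of `combine_*`: it copies the halves of the backing
// array back into native-width vectors. Every 256-bit operation in this impl
// reduces to a split, two 128-bit NEON operations, and a combine.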
#[inline(always)] fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x8(self, a: i32) -> i32x8 { let half = self.splat_i32x4(a); self.combine_i32x4(half, half) } #[inline(always)] fn not_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) } #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) } #[inline(always)] fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) } #[inline(always)] fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) } #[inline(always)] fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) } #[inline(always)] fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) } #[inline(always)] fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) } #[inline(always)] fn shr_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b)) } #[inline(always)] fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) } #[inline(always)] fn shl_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shl_i32x4(a0, b), self.shl_i32x4(a1, b)) } #[inline(always)] fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) } #[inline(always)] fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) } #[inline(always)] fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) } #[inline(always)] fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) } #[inline(always)] fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) } #[inline(always)] fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let 
(a0, _) = self.split_i32x8(a); let (b0, _) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) } #[inline(always)] fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (_, a1) = self.split_i32x8(a); let (_, b1) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) } #[inline(always)] fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) } #[inline(always)] fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) } #[inline(always)] fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_i32x8(b); let (c0, c1) = self.split_i32x8(c); self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) } #[inline(always)] fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) } #[inline(always)] fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) } #[inline(always)] fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) } #[inline(always)] fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { let (a0, a1) = self.split_i32x8(a); self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) } #[inline(always)] fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_u32x4( self.reinterpret_u32_i32x4(a0), self.reinterpret_u32_i32x4(a1), ) } #[inline(always)] fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) } #[inline(always)] fn splat_u32x8(self, a: u32) -> u32x8 { let half = self.splat_u32x4(a); self.combine_u32x4(half, half) } #[inline(always)] fn not_u32x8(self, a: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) } #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) } #[inline(always)] fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) } #[inline(always)] fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); 
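// NEON has a full 32-bit lane multiply (`vmulq_u32`), so each half is a
// single instruction rather than a widening workaround.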
self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) } #[inline(always)] fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) } #[inline(always)] fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) } #[inline(always)] fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) } #[inline(always)] fn shr_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b)) } #[inline(always)] fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) } #[inline(always)] fn shl_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shl_u32x4(a0, b), self.shl_u32x4(a1, b)) } #[inline(always)] fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) } #[inline(always)] fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) } #[inline(always)] fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) } #[inline(always)] fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) } #[inline(always)] fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) } #[inline(always)] fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, _) = self.split_u32x8(a); let (b0, _) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) } #[inline(always)] fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (_, a1) = self.split_u32x8(a); let (_, b1) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) } #[inline(always)] fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) } #[inline(always)] fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) } #[inline(always)] fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_u32x8(b); let (c0, c1) = self.split_u32x8(c); self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) } 
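// For double-width select, the mask is split alongside the data so that mask
// and data lanes stay aligned half by half.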
#[inline(always)] fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) } #[inline(always)] fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) } #[inline(always)] fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { let (a0, a1) = self.split_u32x8(a); self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) } #[inline(always)] fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) } #[inline(always)] fn splat_mask32x8(self, a: i32) -> mask32x8 { let half = self.splat_mask32x4(a); self.combine_mask32x4(half, half) } #[inline(always)] fn not_mask32x8(self, a: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) } #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) } #[inline(always)] fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) } #[inline(always)] fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) } #[inline(always)] fn select_mask32x8( self, a: mask32x8, b: mask32x8, c: mask32x8, ) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); let (c0, c1) = self.split_mask32x8(c); self.combine_mask32x4( self.select_mask32x4(a0, b0, c0), self.select_mask32x4(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) } #[inline(always)] fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f64x4(self, a: f64) -> f64x4 { let half = self.splat_f64x2(a); self.combine_f64x2(half, half) } #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); 
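// The f64x4 operations follow the same split/recombine pattern as f32x8:
// each lane-wise op runs once per 128-bit half.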
self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) } #[inline(always)] fn neg_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) } #[inline(always)] fn sqrt_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) } #[inline(always)] fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) } #[inline(always)] fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) } #[inline(always)] fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) } #[inline(always)] fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) } #[inline(always)] fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) } #[inline(always)] fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) } #[inline(always)] fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) } #[inline(always)] fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) } #[inline(always)] fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) } #[inline(always)] fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, _) = self.split_f64x4(a); let (b0, _) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) } #[inline(always)] fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (_, a1) = self.split_f64x4(a); let (_, b1) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) } #[inline(always)] fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) } #[inline(always)] fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) } #[inline(always)] fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = 
self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) } #[inline(always)] fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.max_precise_f64x2(a0, b0), self.max_precise_f64x2(a1, b1), ) } #[inline(always)] fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) } #[inline(always)] fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.min_precise_f64x2(a0, b0), self.min_precise_f64x2(a1, b1), ) } #[inline(always)] fn madd_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1)) } #[inline(always)] fn msub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1)) } #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) } #[inline(always)] fn fract_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) } #[inline(always)] fn trunc_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) } #[inline(always)] fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) } #[inline(always)] fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { let mut result = [0.0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { let mut b0 = [0.0; 2usize]; let mut b1 = [0.0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( self.reinterpret_f32_f64x2(a0), self.reinterpret_f32_f64x2(a1), ) } #[inline(always)] fn splat_mask64x4(self, a: i64) -> mask64x4 { let half = self.splat_mask64x2(a); self.combine_mask64x2(half, half) } #[inline(always)] fn not_mask64x4(self, a: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) } #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) } #[inline(always)] fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) } 
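// Mask lanes are plain integers (conventionally all-ones for "true", zero for
// "false"), so the remaining `mask64x4` logical ops below (`xor`, `select`,
// `simd_eq`) reduce to the same split/combine pattern as the value types.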
#[inline(always)] fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) } #[inline(always)] fn select_mask64x4( self, a: mask64x4, b: mask64x4, c: mask64x4, ) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); let (c0, c1) = self.split_mask64x4(c); self.combine_mask64x2( self.select_mask64x2(a0, b0, c0), self.select_mask64x2(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) } #[inline(always)] fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { let mut b0 = [0; 2usize]; let mut b1 = [0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f32x16(self, a: f32) -> f32x16 { let half = self.splat_f32x8(a); self.combine_f32x8(half, half) } #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) } #[inline(always)] fn neg_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) } #[inline(always)] fn sqrt_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) } #[inline(always)] fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) } #[inline(always)] fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) } #[inline(always)] fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) } #[inline(always)] fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) } #[inline(always)] fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) } #[inline(always)] fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) } #[inline(always)] fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = 
self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) } #[inline(always)] fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) } #[inline(always)] fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) } #[inline(always)] fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, _) = self.split_f32x16(a); let (b0, _) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) } #[inline(always)] fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (_, a1) = self.split_f32x16(a); let (_, b1) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) } #[inline(always)] fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) } #[inline(always)] fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) } #[inline(always)] fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) } #[inline(always)] fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.max_precise_f32x8(a0, b0), self.max_precise_f32x8(a1, b1), ) } #[inline(always)] fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) } #[inline(always)] fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.min_precise_f32x8(a0, b0), self.min_precise_f32x8(a1, b1), ) } #[inline(always)] fn madd_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.madd_f32x8(a0, b0, c0), self.madd_f32x8(a1, b1, c1)) } #[inline(always)] fn msub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1)) } #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) } #[inline(always)] fn fract_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) } #[inline(always)] fn trunc_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) } #[inline(always)] fn select_f32x16(self, a: mask32x16, b: f32x16, c: 
f32x16) -> f32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) } #[inline(always)] fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { let mut b0 = [0.0; 8usize]; let mut b1 = [0.0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( self.reinterpret_f64_f32x8(a0), self.reinterpret_f64_f32x8(a1), ) } #[inline(always)] fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8( self.reinterpret_i32_f32x8(a0), self.reinterpret_i32_f32x8(a1), ) } #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { unsafe { vld4q_f32(src.as_ptr()).simd_into(self) } } #[inline(always)] fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { unsafe { vst4q_f32(dest.as_mut_ptr(), a.into()) } } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) } #[inline(always)] fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8( self.reinterpret_u32_f32x8(a0), self.reinterpret_u32_f32x8(a1), ) } #[inline(always)] fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) } #[inline(always)] fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) } #[inline(always)] fn splat_i8x64(self, a: i8) -> i8x64 { let half = self.splat_i8x32(a); self.combine_i8x32(half, half) } #[inline(always)] fn not_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) } #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) } #[inline(always)] fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) } #[inline(always)] fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) } #[inline(always)] fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) } #[inline(always)] fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) } #[inline(always)] fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) } #[inline(always)] fn shr_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); 
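// Shift both 32-lane halves right by the same scalar amount, then recombine.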
self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b)) } #[inline(always)] fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) } #[inline(always)] fn shl_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.shl_i8x32(a0, b), self.shl_i8x32(a1, b)) } #[inline(always)] fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) } #[inline(always)] fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) } #[inline(always)] fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) } #[inline(always)] fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) } #[inline(always)] fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) } #[inline(always)] fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, _) = self.split_i8x64(a); let (b0, _) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) } #[inline(always)] fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (_, a1) = self.split_i8x64(a); let (_, b1) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) } #[inline(always)] fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) } #[inline(always)] fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) } #[inline(always)] fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_i8x64(b); let (c0, c1) = self.split_i8x64(c); self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) } #[inline(always)] fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) } #[inline(always)] fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) } #[inline(always)] fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); 
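// Negate each 32-lane half independently, then stitch the i8x64 back together.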
self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) } #[inline(always)] fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) } #[inline(always)] fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { let (a0, a1) = self.split_i8x64(a); self.combine_u32x8( self.reinterpret_u32_i8x32(a0), self.reinterpret_u32_i8x32(a1), ) } #[inline(always)] fn splat_u8x64(self, a: u8) -> u8x64 { let half = self.splat_u8x32(a); self.combine_u8x32(half, half) } #[inline(always)] fn not_u8x64(self, a: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) } #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) } #[inline(always)] fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) } #[inline(always)] fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) } #[inline(always)] fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) } #[inline(always)] fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) } #[inline(always)] fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) } #[inline(always)] fn shr_u8x64(self, a: u8x64, b: u32) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b)) } #[inline(always)] fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) } #[inline(always)] fn shl_u8x64(self, a: u8x64, b: u32) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shl_u8x32(a0, b), self.shl_u8x32(a1, b)) } #[inline(always)] fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) } #[inline(always)] fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) } #[inline(always)] fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) } #[inline(always)] fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) } #[inline(always)] fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) 
= self.split_u8x64(b); self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) } #[inline(always)] fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, _) = self.split_u8x64(a); let (b0, _) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) } #[inline(always)] fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (_, a1) = self.split_u8x64(a); let (_, b1) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) } #[inline(always)] fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) } #[inline(always)] fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) } #[inline(always)] fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_u8x64(b); let (c0, c1) = self.split_u8x64(c); self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) } #[inline(always)] fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) } #[inline(always)] fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) } #[inline(always)] fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { unsafe { vld4q_u8(src.as_ptr()).simd_into(self) } } #[inline(always)] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { unsafe { vst4q_u8(dest.as_mut_ptr(), a.into()) } } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { let (a0, a1) = self.split_u8x64(a); self.combine_u32x8( self.reinterpret_u32_u8x32(a0), self.reinterpret_u32_u8x32(a1), ) } #[inline(always)] fn splat_mask8x64(self, a: i8) -> mask8x64 { let half = self.splat_mask8x32(a); self.combine_mask8x32(half, half) } #[inline(always)] fn not_mask8x64(self, a: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) } #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) } #[inline(always)] fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) } #[inline(always)] fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) } #[inline(always)] fn select_mask8x64( self, a: mask8x64, b: mask8x64, c: mask8x64, ) -> mask8x64 { let (a0, 
a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); let (c0, c1) = self.split_mask8x64(c); self.combine_mask8x32( self.select_mask8x32(a0, b0, c0), self.select_mask8x32(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) } #[inline(always)] fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x32(self, a: i16) -> i16x32 { let half = self.splat_i16x16(a); self.combine_i16x16(half, half) } #[inline(always)] fn not_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) } #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) } #[inline(always)] fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) } #[inline(always)] fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) } #[inline(always)] fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) } #[inline(always)] fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) } #[inline(always)] fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) } #[inline(always)] fn shr_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b)) } #[inline(always)] fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) } #[inline(always)] fn shl_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shl_i16x16(a0, b), self.shl_i16x16(a1, b)) } #[inline(always)] fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) } #[inline(always)] fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) } #[inline(always)] fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_le_i16x16(a0, 
b0), self.simd_le_i16x16(a1, b1)) } #[inline(always)] fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) } #[inline(always)] fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) } #[inline(always)] fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, _) = self.split_i16x32(a); let (b0, _) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) } #[inline(always)] fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (_, a1) = self.split_i16x32(a); let (_, b1) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) } #[inline(always)] fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) } #[inline(always)] fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16( self.unzip_high_i16x16(a0, a1), self.unzip_high_i16x16(b0, b1), ) } #[inline(always)] fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_i16x32(b); let (c0, c1) = self.split_i16x32(c); self.combine_i16x16( self.select_i16x16(a0, b0, c0), self.select_i16x16(a1, b1, c1), ) } #[inline(always)] fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) } #[inline(always)] fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) } #[inline(always)] fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) } #[inline(always)] fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { let (a0, a1) = self.split_i16x32(a); self.combine_u8x32( self.reinterpret_u8_i16x16(a0), self.reinterpret_u8_i16x16(a1), ) } #[inline(always)] fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { let (a0, a1) = self.split_i16x32(a); self.combine_u32x8( self.reinterpret_u32_i16x16(a0), self.reinterpret_u32_i16x16(a1), ) } #[inline(always)] fn splat_u16x32(self, a: u16) -> u16x32 { let half = self.splat_u16x16(a); self.combine_u16x16(half, half) } #[inline(always)] fn not_u16x32(self, a: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) } #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) } #[inline(always)] fn sub_u16x32(self, a: u16x32, b: u16x32) 
-> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) } #[inline(always)] fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) } #[inline(always)] fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) } #[inline(always)] fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) } #[inline(always)] fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) } #[inline(always)] fn shr_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, b)) } #[inline(always)] fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) } #[inline(always)] fn shl_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shl_u16x16(a0, b), self.shl_u16x16(a1, b)) } #[inline(always)] fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) } #[inline(always)] fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) } #[inline(always)] fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) } #[inline(always)] fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) } #[inline(always)] fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) } #[inline(always)] fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, _) = self.split_u16x32(a); let (b0, _) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) } #[inline(always)] fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (_, a1) = self.split_u16x32(a); let (_, b1) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) } #[inline(always)] fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) } #[inline(always)] fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { 
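// `unzip_high` keeps the odd-indexed lanes of the logical concatenation
// [a, b]: the odd lanes of `a` fill the low half of the result and the odd
// lanes of `b` the high half.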
let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16( self.unzip_high_u16x16(a0, a1), self.unzip_high_u16x16(b0, b1), ) } #[inline(always)] fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_u16x32(b); let (c0, c1) = self.split_u16x32(c); self.combine_u16x16( self.select_u16x16(a0, b0, c0), self.select_u16x16(a1, b1, c1), ) } #[inline(always)] fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) } #[inline(always)] fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) } #[inline(always)] fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { unsafe { vld4q_u16(src.as_ptr()).simd_into(self) } } #[inline(always)] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { unsafe { vst4q_u16(dest.as_mut_ptr(), a.into()) } } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) } #[inline(always)] fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { let (a0, a1) = self.split_u16x32(a); self.combine_u8x32( self.reinterpret_u8_u16x16(a0), self.reinterpret_u8_u16x16(a1), ) } #[inline(always)] fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { let (a0, a1) = self.split_u16x32(a); self.combine_u32x8( self.reinterpret_u32_u16x16(a0), self.reinterpret_u32_u16x16(a1), ) } #[inline(always)] fn splat_mask16x32(self, a: i16) -> mask16x32 { let half = self.splat_mask16x16(a); self.combine_mask16x16(half, half) } #[inline(always)] fn not_mask16x32(self, a: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) } #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) } #[inline(always)] fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) } #[inline(always)] fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) } #[inline(always)] fn select_mask16x32( self, a: mask16x32, b: mask16x32, c: mask16x32, ) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); let (c0, c1) = self.split_mask16x32(c); self.combine_mask16x16( self.select_mask16x16(a0, b0, c0), self.select_mask16x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); 
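// Compare the two 16-lane mask halves for equality and join the partial
// results back into a mask16x32.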
self.combine_mask16x16( self.simd_eq_mask16x16(a0, b0), self.simd_eq_mask16x16(a1, b1), ) } #[inline(always)] fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x16(self, a: i32) -> i32x16 { let half = self.splat_i32x8(a); self.combine_i32x8(half, half) } #[inline(always)] fn not_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) } #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) } #[inline(always)] fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) } #[inline(always)] fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) } #[inline(always)] fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) } #[inline(always)] fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) } #[inline(always)] fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) } #[inline(always)] fn shr_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b)) } #[inline(always)] fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) } #[inline(always)] fn shl_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shl_i32x8(a0, b), self.shl_i32x8(a1, b)) } #[inline(always)] fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) } #[inline(always)] fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) } #[inline(always)] fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) } #[inline(always)] fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) } #[inline(always)] fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, 
b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) } #[inline(always)] fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, _) = self.split_i32x16(a); let (b0, _) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) } #[inline(always)] fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (_, a1) = self.split_i32x16(a); let (_, b1) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) } #[inline(always)] fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) } #[inline(always)] fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) } #[inline(always)] fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_i32x16(b); let (c0, c1) = self.split_i32x16(c); self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) } #[inline(always)] fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) } #[inline(always)] fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) } #[inline(always)] fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) } #[inline(always)] fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { let (a0, a1) = self.split_i32x16(a); self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) } #[inline(always)] fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_u32x8( self.reinterpret_u32_i32x8(a0), self.reinterpret_u32_i32x8(a1), ) } #[inline(always)] fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) } #[inline(always)] fn splat_u32x16(self, a: u32) -> u32x16 { let half = self.splat_u32x8(a); self.combine_u32x8(half, half) } #[inline(always)] fn not_u32x16(self, a: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) } #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) } #[inline(always)] fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) } #[inline(always)] fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = 
self.split_u32x16(b); self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) } #[inline(always)] fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) } #[inline(always)] fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) } #[inline(always)] fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) } #[inline(always)] fn shr_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b)) } #[inline(always)] fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) } #[inline(always)] fn shl_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shl_u32x8(a0, b), self.shl_u32x8(a1, b)) } #[inline(always)] fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) } #[inline(always)] fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) } #[inline(always)] fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) } #[inline(always)] fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) } #[inline(always)] fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) } #[inline(always)] fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, _) = self.split_u32x16(a); let (b0, _) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) } #[inline(always)] fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (_, a1) = self.split_u32x16(a); let (_, b1) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) } #[inline(always)] fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) } #[inline(always)] fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) } #[inline(always)] fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_u32x16(b); let 
(c0, c1) = self.split_u32x16(c); self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) } #[inline(always)] fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) } #[inline(always)] fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) } #[inline(always)] fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { unsafe { vld4q_u32(src.as_ptr()).simd_into(self) } } #[inline(always)] fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { unsafe { vst4q_u32(dest.as_mut_ptr(), a.into()) } } #[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { let (a0, a1) = self.split_u32x16(a); self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) } #[inline(always)] fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) } #[inline(always)] fn splat_mask32x16(self, a: i32) -> mask32x16 { let half = self.splat_mask32x8(a); self.combine_mask32x8(half, half) } #[inline(always)] fn not_mask32x16(self, a: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) } #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) } #[inline(always)] fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) } #[inline(always)] fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) } #[inline(always)] fn select_mask32x16( self, a: mask32x16, b: mask32x16, c: mask32x16, ) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); let (c0, c1) = self.split_mask32x16(c); self.combine_mask32x8( self.select_mask32x8(a0, b0, c0), self.select_mask32x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) } #[inline(always)] fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f64x8(self, a: f64) -> f64x8 { let half = self.splat_f64x4(a); self.combine_f64x4(half, half) } #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); 
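// Delegate `abs` to each f64x4 half; that op itself splits down to the native
// f64x2 width, so one f64x8 `abs` becomes four 128-bit operations.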
self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) } #[inline(always)] fn neg_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) } #[inline(always)] fn sqrt_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) } #[inline(always)] fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) } #[inline(always)] fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) } #[inline(always)] fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) } #[inline(always)] fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) } #[inline(always)] fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) } #[inline(always)] fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) } #[inline(always)] fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) } #[inline(always)] fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) } #[inline(always)] fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) } #[inline(always)] fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, _) = self.split_f64x8(a); let (b0, _) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) } #[inline(always)] fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (_, a1) = self.split_f64x8(a); let (_, b1) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) } #[inline(always)] fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) } #[inline(always)] fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) } #[inline(always)] fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = 
self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) } #[inline(always)] fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.max_precise_f64x4(a0, b0), self.max_precise_f64x4(a1, b1), ) } #[inline(always)] fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) } #[inline(always)] fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.min_precise_f64x4(a0, b0), self.min_precise_f64x4(a1, b1), ) } #[inline(always)] fn madd_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1)) } #[inline(always)] fn msub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1)) } #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) } #[inline(always)] fn fract_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) } #[inline(always)] fn trunc_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) } #[inline(always)] fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) } #[inline(always)] fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( self.reinterpret_f32_f64x4(a0), self.reinterpret_f32_f64x4(a1), ) } #[inline(always)] fn splat_mask64x8(self, a: i64) -> mask64x8 { let half = self.splat_mask64x4(a); self.combine_mask64x4(half, half) } #[inline(always)] fn not_mask64x8(self, a: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1)) } #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1)) } #[inline(always)] fn or_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1)) } #[inline(always)] fn xor_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, 
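// `split_f64x8` above goes through two stack arrays and `copy_from_slice`
// rather than pointer casts; with #[inline(always)] those copies are expected
// to fold into plain register moves (an optimization expectation, not a
// guarantee stated by this file).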
b1))
    }
    #[inline(always)]
    fn select_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>, c: mask64x8<Self>) -> mask64x8<Self> { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); let (c0, c1) = self.split_mask64x8(c); self.combine_mask64x4(self.select_mask64x4(a0, b0, c0), self.select_mask64x4(a1, b1, c1)) }
    #[inline(always)]
    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1)) }
    #[inline(always)]
    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) }
}
impl<S: Simd> SimdFrom<float32x4_t, S> for f32x4<S> { #[inline(always)] fn simd_from(arch: float32x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<f32x4<S>> for float32x4_t { #[inline(always)] fn from(value: f32x4<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<int8x16_t, S> for i8x16<S> { #[inline(always)] fn simd_from(arch: int8x16_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<i8x16<S>> for int8x16_t { #[inline(always)] fn from(value: i8x16<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<uint8x16_t, S> for u8x16<S> { #[inline(always)] fn simd_from(arch: uint8x16_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<u8x16<S>> for uint8x16_t { #[inline(always)] fn from(value: u8x16<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<int8x16_t, S> for mask8x16<S> { #[inline(always)] fn simd_from(arch: int8x16_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<mask8x16<S>> for int8x16_t { #[inline(always)] fn from(value: mask8x16<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<int16x8_t, S> for i16x8<S> { #[inline(always)] fn simd_from(arch: int16x8_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<i16x8<S>> for int16x8_t { #[inline(always)] fn from(value: i16x8<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<uint16x8_t, S> for u16x8<S> { #[inline(always)] fn simd_from(arch: uint16x8_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<u16x8<S>> for uint16x8_t { #[inline(always)] fn from(value: u16x8<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<int16x8_t, S> for mask16x8<S> { #[inline(always)] fn simd_from(arch: int16x8_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<mask16x8<S>> for int16x8_t { #[inline(always)] fn from(value: mask16x8<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<int32x4_t, S> for i32x4<S> { #[inline(always)] fn simd_from(arch: int32x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<i32x4<S>> for int32x4_t { #[inline(always)] fn from(value: i32x4<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<uint32x4_t, S> for u32x4<S> { #[inline(always)] fn simd_from(arch: uint32x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<u32x4<S>> for uint32x4_t { #[inline(always)] fn from(value: u32x4<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<int32x4_t, S> for mask32x4<S> { #[inline(always)] fn simd_from(arch: int32x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<mask32x4<S>> for int32x4_t {
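// All conversions in this section are byte-for-byte reinterpretations: `val`
// is a plain lane array whose size matches the corresponding NEON register
// type, so `core::mem::transmute` moves the bits unchanged in both directions.
// Hedged usage sketch (the `neon` capability value is assumed, not shown here):
//
//     let v = u32x4::simd_from(vdupq_n_u32(7), neon); // arch -> portable
//     let raw: uint32x4_t = v.into();                 // portable -> arch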
#[inline(always)] fn from(value: mask32x4) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for f64x2 { #[inline(always)] fn simd_from(arch: float64x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for float64x2_t { #[inline(always)] fn from(value: f64x2) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for mask64x2 { #[inline(always)] fn simd_from(arch: int64x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int64x2_t { #[inline(always)] fn from(value: mask64x2) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for f32x8 { #[inline(always)] fn simd_from(arch: float32x4x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for float32x4x2_t { #[inline(always)] fn from(value: f32x8) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for i8x32 { #[inline(always)] fn simd_from(arch: int8x16x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int8x16x2_t { #[inline(always)] fn from(value: i8x32) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for u8x32 { #[inline(always)] fn simd_from(arch: uint8x16x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for uint8x16x2_t { #[inline(always)] fn from(value: u8x32) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for mask8x32 { #[inline(always)] fn simd_from(arch: int8x16x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int8x16x2_t { #[inline(always)] fn from(value: mask8x32) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for i16x16 { #[inline(always)] fn simd_from(arch: int16x8x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int16x8x2_t { #[inline(always)] fn from(value: i16x16) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for u16x16 { #[inline(always)] fn simd_from(arch: uint16x8x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for uint16x8x2_t { #[inline(always)] fn from(value: u16x16) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for mask16x16 { #[inline(always)] fn simd_from(arch: int16x8x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int16x8x2_t { #[inline(always)] fn from(value: mask16x16) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for i32x8 { #[inline(always)] fn simd_from(arch: int32x4x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int32x4x2_t { #[inline(always)] fn from(value: i32x8) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for u32x8 { #[inline(always)] fn simd_from(arch: uint32x4x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for uint32x4x2_t { #[inline(always)] fn from(value: u32x8) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for mask32x8 { #[inline(always)] fn simd_from(arch: int32x4x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int32x4x2_t { #[inline(always)] fn from(value: mask32x8) -> Self { unsafe { core::mem::transmute(value.val) } } } impl 
SimdFrom for f64x4 { #[inline(always)] fn simd_from(arch: float64x2x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for float64x2x2_t { #[inline(always)] fn from(value: f64x4) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for mask64x4 { #[inline(always)] fn simd_from(arch: int64x2x2_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int64x2x2_t { #[inline(always)] fn from(value: mask64x4) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for f32x16 { #[inline(always)] fn simd_from(arch: float32x4x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for float32x4x4_t { #[inline(always)] fn from(value: f32x16) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for i8x64 { #[inline(always)] fn simd_from(arch: int8x16x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int8x16x4_t { #[inline(always)] fn from(value: i8x64) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for u8x64 { #[inline(always)] fn simd_from(arch: uint8x16x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for uint8x16x4_t { #[inline(always)] fn from(value: u8x64) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for mask8x64 { #[inline(always)] fn simd_from(arch: int8x16x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int8x16x4_t { #[inline(always)] fn from(value: mask8x64) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for i16x32 { #[inline(always)] fn simd_from(arch: int16x8x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int16x8x4_t { #[inline(always)] fn from(value: i16x32) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for u16x32 { #[inline(always)] fn simd_from(arch: uint16x8x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for uint16x8x4_t { #[inline(always)] fn from(value: u16x32) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for mask16x32 { #[inline(always)] fn simd_from(arch: int16x8x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int16x8x4_t { #[inline(always)] fn from(value: mask16x32) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for i32x16 { #[inline(always)] fn simd_from(arch: int32x4x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int32x4x4_t { #[inline(always)] fn from(value: i32x16) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for u32x16 { #[inline(always)] fn simd_from(arch: uint32x4x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for uint32x4x4_t { #[inline(always)] fn from(value: u32x16) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for mask32x16 { #[inline(always)] fn simd_from(arch: int32x4x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd, } } } impl From> for int32x4x4_t { #[inline(always)] fn from(value: mask32x16) -> Self { unsafe { core::mem::transmute(value.val) } } } impl SimdFrom for f64x8 { #[inline(always)] fn simd_from(arch: float64x2x4_t, simd: S) -> Self { 
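// From here down the arch-side types are NEON register pairs and quads
// (float64x2x2_t, int8x16x4_t, and friends): AArch64 NEON has no single
// register wider than 128 bits, so the 256- and 512-bit vector types are
// modeled as x2/x4 tuples of 128-bit registers and transmuted to and from
// their flat lane arrays in exactly the same way as the single registers.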
Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<f64x8<S>> for float64x2x4_t { #[inline(always)] fn from(value: f64x8<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<int64x2x4_t, S> for mask64x8<S> { #[inline(always)] fn simd_from(arch: int64x2x4_t, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<mask64x8<S>> for int64x2x4_t { #[inline(always)] fn from(value: mask64x8<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
fearless_simd-0.3.0/src/generated/ops.rs000064400000000000000000005667561046102023000163670ustar 00000000000000// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

// This file is autogenerated by fearless_simd_gen

use crate::{Simd, SimdInto};
use crate::{
    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32,
    i32x4, i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32,
    mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64,
    u16x8, u16x16, u16x32, u32x4, u32x8, u32x16,
};
impl<S: Simd> core::ops::Neg for f32x4<S> { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_f32x4(self) } }
impl<S: Simd> core::ops::Add for f32x4<S> { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_f32x4(self, rhs) } }
impl<S: Simd> core::ops::AddAssign for f32x4<S> { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_f32x4(*self, rhs); } }
impl<S: Simd> core::ops::Add<f32> for f32x4<S> { type Output = Self; #[inline(always)] fn add(self, rhs: f32) -> Self::Output { self.simd.add_f32x4(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::AddAssign<f32> for f32x4<S> { #[inline(always)] fn add_assign(&mut self, rhs: f32) { *self = self.simd.add_f32x4(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Add<f32x4<S>> for f32 { type Output = f32x4<S>; #[inline(always)] fn add(self, rhs: f32x4<S>) -> Self::Output { rhs.simd.add_f32x4(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Sub for f32x4<S> { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_f32x4(self, rhs) } }
impl<S: Simd> core::ops::SubAssign for f32x4<S> { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_f32x4(*self, rhs); } }
impl<S: Simd> core::ops::Sub<f32> for f32x4<S> { type Output = Self; #[inline(always)] fn sub(self, rhs: f32) -> Self::Output { self.simd.sub_f32x4(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::SubAssign<f32> for f32x4<S> { #[inline(always)] fn sub_assign(&mut self, rhs: f32) { *self = self.simd.sub_f32x4(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Sub<f32x4<S>> for f32 { type Output = f32x4<S>; #[inline(always)] fn sub(self, rhs: f32x4<S>) -> Self::Output { rhs.simd.sub_f32x4(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Mul for f32x4<S> { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_f32x4(self, rhs) } }
impl<S: Simd> core::ops::MulAssign for f32x4<S> { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_f32x4(*self, rhs); } }
impl<S: Simd> core::ops::Mul<f32> for f32x4<S> { type Output = Self; #[inline(always)] fn mul(self, rhs: f32) -> Self::Output { self.simd.mul_f32x4(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::MulAssign<f32> for f32x4<S> { #[inline(always)] fn mul_assign(&mut self, rhs: f32) { *self = self.simd.mul_f32x4(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Mul<f32x4<S>> for f32 { type Output = f32x4<S>; #[inline(always)] fn mul(self, rhs: f32x4<S>) -> Self::Output { rhs.simd.mul_f32x4(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd>
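// Each arithmetic operator in this file is generated in three shapes:
// vector-vector, vector-scalar, and scalar-vector; the scalar side is always
// broadcast with `simd_into` and then dispatched to the same Simd method.
// Hedged usage sketch (`level` is an assumed Simd capability value; the array
// conversion mirrors the `simd_into` calls used by the backend above):
//
//     let x: f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(level);
//     let y = x + 1.0;  // splats 1.0, then add_f32x4
//     let z = 0.5 * y;  // scalar on the left works too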
core::ops::Div for f32x4 { type Output = Self; #[inline(always)] fn div(self, rhs: Self) -> Self::Output { self.simd.div_f32x4(self, rhs) } } impl core::ops::DivAssign for f32x4 { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = self.simd.div_f32x4(*self, rhs); } } impl core::ops::Div for f32x4 { type Output = Self; #[inline(always)] fn div(self, rhs: f32) -> Self::Output { self.simd.div_f32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::DivAssign for f32x4 { #[inline(always)] fn div_assign(&mut self, rhs: f32) { *self = self.simd.div_f32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Div> for f32 { type Output = f32x4; #[inline(always)] fn div(self, rhs: f32x4) -> Self::Output { rhs.simd.div_f32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Neg for i8x16 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_i8x16(self) } } impl core::ops::Add for i8x16 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_i8x16(self, rhs) } } impl core::ops::AddAssign for i8x16 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_i8x16(*self, rhs); } } impl core::ops::Add for i8x16 { type Output = Self; #[inline(always)] fn add(self, rhs: i8) -> Self::Output { self.simd.add_i8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for i8x16 { #[inline(always)] fn add_assign(&mut self, rhs: i8) { *self = self.simd.add_i8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for i8 { type Output = i8x16; #[inline(always)] fn add(self, rhs: i8x16) -> Self::Output { rhs.simd.add_i8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for i8x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_i8x16(self, rhs) } } impl core::ops::SubAssign for i8x16 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_i8x16(*self, rhs); } } impl core::ops::Sub for i8x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: i8) -> Self::Output { self.simd.sub_i8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for i8x16 { #[inline(always)] fn sub_assign(&mut self, rhs: i8) { *self = self.simd.sub_i8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for i8 { type Output = i8x16; #[inline(always)] fn sub(self, rhs: i8x16) -> Self::Output { rhs.simd.sub_i8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for i8x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_i8x16(self, rhs) } } impl core::ops::MulAssign for i8x16 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_i8x16(*self, rhs); } } impl core::ops::Mul for i8x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: i8) -> Self::Output { self.simd.mul_i8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for i8x16 { #[inline(always)] fn mul_assign(&mut self, rhs: i8) { *self = self.simd.mul_i8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for i8 { type Output = i8x16; #[inline(always)] fn mul(self, rhs: i8x16) -> Self::Output { rhs.simd.mul_i8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for i8x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_i8x16(self, rhs) } } impl core::ops::BitAndAssign for i8x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_i8x16(*self, rhs); } } impl core::ops::BitAnd 
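// None of the *Assign impls have their own backend hooks: each one rewrites
// to the pure form, e.g. `*self = self.simd.add_i8x16(*self, rhs)`, so a
// backend only ever needs to implement the non-assigning methods.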
for i8x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i8) -> Self::Output { self.simd.and_i8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for i8x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: i8) { *self = self.simd.and_i8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i8 { type Output = i8x16; #[inline(always)] fn bitand(self, rhs: i8x16) -> Self::Output { rhs.simd.and_i8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for i8x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_i8x16(self, rhs) } } impl core::ops::BitOrAssign for i8x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_i8x16(*self, rhs); } } impl core::ops::BitOr for i8x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i8) -> Self::Output { self.simd.or_i8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for i8x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: i8) { *self = self.simd.or_i8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i8 { type Output = i8x16; #[inline(always)] fn bitor(self, rhs: i8x16) -> Self::Output { rhs.simd.or_i8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for i8x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_i8x16(self, rhs) } } impl core::ops::BitXorAssign for i8x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_i8x16(*self, rhs); } } impl core::ops::BitXor for i8x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i8) -> Self::Output { self.simd.xor_i8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for i8x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i8) { *self = self.simd.xor_i8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i8 { type Output = i8x16; #[inline(always)] fn bitxor(self, rhs: i8x16) -> Self::Output { rhs.simd.xor_i8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for i8x16 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_i8x16(self, rhs) } } impl core::ops::Shr for i8x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_i8x16(self, rhs) } } impl core::ops::ShlAssign for i8x16 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_i8x16(*self, rhs); } } impl core::ops::ShrAssign for i8x16 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_i8x16(*self, rhs); } } impl core::ops::Shr for i8x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_i8x16(self, rhs) } } impl core::ops::ShrAssign for i8x16 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_i8x16(*self, rhs); } } impl core::ops::Add for u8x16 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_u8x16(self, rhs) } } impl core::ops::AddAssign for u8x16 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_u8x16(*self, rhs); } } impl core::ops::Add for u8x16 { type Output = Self; #[inline(always)] fn add(self, rhs: u8) -> Self::Output { self.simd.add_u8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for u8x16 { #[inline(always)] fn add_assign(&mut self, rhs: u8) { *self = self.simd.add_u8x16(*self, rhs.simd_into(self.simd)); } } impl 
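// Shift shapes: Shl/Shr with a `u32` rhs shift every lane by one shared
// amount (shl_* / shr_*), while Shr by another vector of the same type goes
// through the lanewise-variable shrv_* form. There is no vector-rhs Shl in
// this generated set.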
core::ops::Add> for u8 { type Output = u8x16; #[inline(always)] fn add(self, rhs: u8x16) -> Self::Output { rhs.simd.add_u8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for u8x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_u8x16(self, rhs) } } impl core::ops::SubAssign for u8x16 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_u8x16(*self, rhs); } } impl core::ops::Sub for u8x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: u8) -> Self::Output { self.simd.sub_u8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for u8x16 { #[inline(always)] fn sub_assign(&mut self, rhs: u8) { *self = self.simd.sub_u8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for u8 { type Output = u8x16; #[inline(always)] fn sub(self, rhs: u8x16) -> Self::Output { rhs.simd.sub_u8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for u8x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_u8x16(self, rhs) } } impl core::ops::MulAssign for u8x16 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_u8x16(*self, rhs); } } impl core::ops::Mul for u8x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: u8) -> Self::Output { self.simd.mul_u8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for u8x16 { #[inline(always)] fn mul_assign(&mut self, rhs: u8) { *self = self.simd.mul_u8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for u8 { type Output = u8x16; #[inline(always)] fn mul(self, rhs: u8x16) -> Self::Output { rhs.simd.mul_u8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for u8x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_u8x16(self, rhs) } } impl core::ops::BitAndAssign for u8x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_u8x16(*self, rhs); } } impl core::ops::BitAnd for u8x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: u8) -> Self::Output { self.simd.and_u8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for u8x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: u8) { *self = self.simd.and_u8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for u8 { type Output = u8x16; #[inline(always)] fn bitand(self, rhs: u8x16) -> Self::Output { rhs.simd.and_u8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for u8x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_u8x16(self, rhs) } } impl core::ops::BitOrAssign for u8x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_u8x16(*self, rhs); } } impl core::ops::BitOr for u8x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: u8) -> Self::Output { self.simd.or_u8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for u8x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: u8) { *self = self.simd.or_u8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for u8 { type Output = u8x16; #[inline(always)] fn bitor(self, rhs: u8x16) -> Self::Output { rhs.simd.or_u8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for u8x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_u8x16(self, rhs) } } impl core::ops::BitXorAssign for u8x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: 
Self) { *self = self.simd.xor_u8x16(*self, rhs); } } impl core::ops::BitXor for u8x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: u8) -> Self::Output { self.simd.xor_u8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for u8x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: u8) { *self = self.simd.xor_u8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for u8 { type Output = u8x16; #[inline(always)] fn bitxor(self, rhs: u8x16) -> Self::Output { rhs.simd.xor_u8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for u8x16 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_u8x16(self, rhs) } } impl core::ops::Shr for u8x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_u8x16(self, rhs) } } impl core::ops::ShlAssign for u8x16 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_u8x16(*self, rhs); } } impl core::ops::ShrAssign for u8x16 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_u8x16(*self, rhs); } } impl core::ops::Shr for u8x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_u8x16(self, rhs) } } impl core::ops::ShrAssign for u8x16 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_u8x16(*self, rhs); } } impl core::ops::BitAnd for mask8x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask8x16(self, rhs) } } impl core::ops::BitAndAssign for mask8x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask8x16(*self, rhs); } } impl core::ops::BitAnd for mask8x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i8) -> Self::Output { self.simd.and_mask8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask8x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: i8) { *self = self.simd.and_mask8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i8 { type Output = mask8x16; #[inline(always)] fn bitand(self, rhs: mask8x16) -> Self::Output { rhs.simd.and_mask8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask8x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask8x16(self, rhs) } } impl core::ops::BitOrAssign for mask8x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask8x16(*self, rhs); } } impl core::ops::BitOr for mask8x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i8) -> Self::Output { self.simd.or_mask8x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask8x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: i8) { *self = self.simd.or_mask8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i8 { type Output = mask8x16; #[inline(always)] fn bitor(self, rhs: mask8x16) -> Self::Output { rhs.simd.or_mask8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask8x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask8x16(self, rhs) } } impl core::ops::BitXorAssign for mask8x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask8x16(*self, rhs); } } impl core::ops::BitXor for mask8x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i8) -> Self::Output { self.simd.xor_mask8x16(self, 
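// Mask vectors only implement the bitwise operators and Not; no arithmetic
// is generated for them. The scalar rhs is the signed lane type (i8 here),
// consistent with lanes that are all-ones for true and all-zeros for false
// (the usual SIMD mask encoding; an assumption about this crate's convention,
// not stated in this file).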
rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask8x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i8) { *self = self.simd.xor_mask8x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i8 { type Output = mask8x16; #[inline(always)] fn bitxor(self, rhs: mask8x16) -> Self::Output { rhs.simd.xor_mask8x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask8x16 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { self.simd.not_mask8x16(self) } } impl core::ops::Neg for i16x8 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_i16x8(self) } } impl core::ops::Add for i16x8 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_i16x8(self, rhs) } } impl core::ops::AddAssign for i16x8 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_i16x8(*self, rhs); } } impl core::ops::Add for i16x8 { type Output = Self; #[inline(always)] fn add(self, rhs: i16) -> Self::Output { self.simd.add_i16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for i16x8 { #[inline(always)] fn add_assign(&mut self, rhs: i16) { *self = self.simd.add_i16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for i16 { type Output = i16x8; #[inline(always)] fn add(self, rhs: i16x8) -> Self::Output { rhs.simd.add_i16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for i16x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_i16x8(self, rhs) } } impl core::ops::SubAssign for i16x8 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_i16x8(*self, rhs); } } impl core::ops::Sub for i16x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: i16) -> Self::Output { self.simd.sub_i16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for i16x8 { #[inline(always)] fn sub_assign(&mut self, rhs: i16) { *self = self.simd.sub_i16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for i16 { type Output = i16x8; #[inline(always)] fn sub(self, rhs: i16x8) -> Self::Output { rhs.simd.sub_i16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for i16x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_i16x8(self, rhs) } } impl core::ops::MulAssign for i16x8 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_i16x8(*self, rhs); } } impl core::ops::Mul for i16x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: i16) -> Self::Output { self.simd.mul_i16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for i16x8 { #[inline(always)] fn mul_assign(&mut self, rhs: i16) { *self = self.simd.mul_i16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for i16 { type Output = i16x8; #[inline(always)] fn mul(self, rhs: i16x8) -> Self::Output { rhs.simd.mul_i16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for i16x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_i16x8(self, rhs) } } impl core::ops::BitAndAssign for i16x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_i16x8(*self, rhs); } } impl core::ops::BitAnd for i16x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i16) -> Self::Output { self.simd.and_i16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for i16x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: i16) 
{ *self = self.simd.and_i16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i16 { type Output = i16x8; #[inline(always)] fn bitand(self, rhs: i16x8) -> Self::Output { rhs.simd.and_i16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for i16x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_i16x8(self, rhs) } } impl core::ops::BitOrAssign for i16x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_i16x8(*self, rhs); } } impl core::ops::BitOr for i16x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i16) -> Self::Output { self.simd.or_i16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for i16x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: i16) { *self = self.simd.or_i16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i16 { type Output = i16x8; #[inline(always)] fn bitor(self, rhs: i16x8) -> Self::Output { rhs.simd.or_i16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for i16x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_i16x8(self, rhs) } } impl core::ops::BitXorAssign for i16x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_i16x8(*self, rhs); } } impl core::ops::BitXor for i16x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i16) -> Self::Output { self.simd.xor_i16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for i16x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i16) { *self = self.simd.xor_i16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i16 { type Output = i16x8; #[inline(always)] fn bitxor(self, rhs: i16x8) -> Self::Output { rhs.simd.xor_i16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for i16x8 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_i16x8(self, rhs) } } impl core::ops::Shr for i16x8 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_i16x8(self, rhs) } } impl core::ops::ShlAssign for i16x8 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_i16x8(*self, rhs); } } impl core::ops::ShrAssign for i16x8 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_i16x8(*self, rhs); } } impl core::ops::Shr for i16x8 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_i16x8(self, rhs) } } impl core::ops::ShrAssign for i16x8 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_i16x8(*self, rhs); } } impl core::ops::Add for u16x8 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_u16x8(self, rhs) } } impl core::ops::AddAssign for u16x8 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_u16x8(*self, rhs); } } impl core::ops::Add for u16x8 { type Output = Self; #[inline(always)] fn add(self, rhs: u16) -> Self::Output { self.simd.add_u16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for u16x8 { #[inline(always)] fn add_assign(&mut self, rhs: u16) { *self = self.simd.add_u16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for u16 { type Output = u16x8; #[inline(always)] fn add(self, rhs: u16x8) -> Self::Output { rhs.simd.add_u16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for u16x8 { type Output = Self; #[inline(always)] fn 
sub(self, rhs: Self) -> Self::Output { self.simd.sub_u16x8(self, rhs) } } impl core::ops::SubAssign for u16x8 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_u16x8(*self, rhs); } } impl core::ops::Sub for u16x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: u16) -> Self::Output { self.simd.sub_u16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for u16x8 { #[inline(always)] fn sub_assign(&mut self, rhs: u16) { *self = self.simd.sub_u16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for u16 { type Output = u16x8; #[inline(always)] fn sub(self, rhs: u16x8) -> Self::Output { rhs.simd.sub_u16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for u16x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_u16x8(self, rhs) } } impl core::ops::MulAssign for u16x8 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_u16x8(*self, rhs); } } impl core::ops::Mul for u16x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: u16) -> Self::Output { self.simd.mul_u16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for u16x8 { #[inline(always)] fn mul_assign(&mut self, rhs: u16) { *self = self.simd.mul_u16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for u16 { type Output = u16x8; #[inline(always)] fn mul(self, rhs: u16x8) -> Self::Output { rhs.simd.mul_u16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for u16x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_u16x8(self, rhs) } } impl core::ops::BitAndAssign for u16x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_u16x8(*self, rhs); } } impl core::ops::BitAnd for u16x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: u16) -> Self::Output { self.simd.and_u16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for u16x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: u16) { *self = self.simd.and_u16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for u16 { type Output = u16x8; #[inline(always)] fn bitand(self, rhs: u16x8) -> Self::Output { rhs.simd.and_u16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for u16x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_u16x8(self, rhs) } } impl core::ops::BitOrAssign for u16x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_u16x8(*self, rhs); } } impl core::ops::BitOr for u16x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: u16) -> Self::Output { self.simd.or_u16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for u16x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: u16) { *self = self.simd.or_u16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for u16 { type Output = u16x8; #[inline(always)] fn bitor(self, rhs: u16x8) -> Self::Output { rhs.simd.or_u16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for u16x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_u16x8(self, rhs) } } impl core::ops::BitXorAssign for u16x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_u16x8(*self, rhs); } } impl core::ops::BitXor for u16x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: u16) -> Self::Output { self.simd.xor_u16x8(self, rhs.simd_into(self.simd)) 
} } impl core::ops::BitXorAssign for u16x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: u16) { *self = self.simd.xor_u16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for u16 { type Output = u16x8; #[inline(always)] fn bitxor(self, rhs: u16x8) -> Self::Output { rhs.simd.xor_u16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for u16x8 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_u16x8(self, rhs) } } impl core::ops::Shr for u16x8 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_u16x8(self, rhs) } } impl core::ops::ShlAssign for u16x8 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_u16x8(*self, rhs); } } impl core::ops::ShrAssign for u16x8 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_u16x8(*self, rhs); } } impl core::ops::Shr for u16x8 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_u16x8(self, rhs) } } impl core::ops::ShrAssign for u16x8 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_u16x8(*self, rhs); } } impl core::ops::BitAnd for mask16x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask16x8(self, rhs) } } impl core::ops::BitAndAssign for mask16x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask16x8(*self, rhs); } } impl core::ops::BitAnd for mask16x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i16) -> Self::Output { self.simd.and_mask16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask16x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: i16) { *self = self.simd.and_mask16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i16 { type Output = mask16x8; #[inline(always)] fn bitand(self, rhs: mask16x8) -> Self::Output { rhs.simd.and_mask16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask16x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask16x8(self, rhs) } } impl core::ops::BitOrAssign for mask16x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask16x8(*self, rhs); } } impl core::ops::BitOr for mask16x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i16) -> Self::Output { self.simd.or_mask16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask16x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: i16) { *self = self.simd.or_mask16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i16 { type Output = mask16x8; #[inline(always)] fn bitor(self, rhs: mask16x8) -> Self::Output { rhs.simd.or_mask16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask16x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask16x8(self, rhs) } } impl core::ops::BitXorAssign for mask16x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask16x8(*self, rhs); } } impl core::ops::BitXor for mask16x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i16) -> Self::Output { self.simd.xor_mask16x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask16x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i16) { *self = self.simd.xor_mask16x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for 
i16 { type Output = mask16x8; #[inline(always)] fn bitxor(self, rhs: mask16x8) -> Self::Output { rhs.simd.xor_mask16x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask16x8 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { self.simd.not_mask16x8(self) } } impl core::ops::Neg for i32x4 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_i32x4(self) } } impl core::ops::Add for i32x4 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_i32x4(self, rhs) } } impl core::ops::AddAssign for i32x4 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_i32x4(*self, rhs); } } impl core::ops::Add for i32x4 { type Output = Self; #[inline(always)] fn add(self, rhs: i32) -> Self::Output { self.simd.add_i32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for i32x4 { #[inline(always)] fn add_assign(&mut self, rhs: i32) { *self = self.simd.add_i32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for i32 { type Output = i32x4; #[inline(always)] fn add(self, rhs: i32x4) -> Self::Output { rhs.simd.add_i32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for i32x4 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_i32x4(self, rhs) } } impl core::ops::SubAssign for i32x4 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_i32x4(*self, rhs); } } impl core::ops::Sub for i32x4 { type Output = Self; #[inline(always)] fn sub(self, rhs: i32) -> Self::Output { self.simd.sub_i32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for i32x4 { #[inline(always)] fn sub_assign(&mut self, rhs: i32) { *self = self.simd.sub_i32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for i32 { type Output = i32x4; #[inline(always)] fn sub(self, rhs: i32x4) -> Self::Output { rhs.simd.sub_i32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for i32x4 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_i32x4(self, rhs) } } impl core::ops::MulAssign for i32x4 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_i32x4(*self, rhs); } } impl core::ops::Mul for i32x4 { type Output = Self; #[inline(always)] fn mul(self, rhs: i32) -> Self::Output { self.simd.mul_i32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for i32x4 { #[inline(always)] fn mul_assign(&mut self, rhs: i32) { *self = self.simd.mul_i32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for i32 { type Output = i32x4; #[inline(always)] fn mul(self, rhs: i32x4) -> Self::Output { rhs.simd.mul_i32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for i32x4 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_i32x4(self, rhs) } } impl core::ops::BitAndAssign for i32x4 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_i32x4(*self, rhs); } } impl core::ops::BitAnd for i32x4 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i32) -> Self::Output { self.simd.and_i32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for i32x4 { #[inline(always)] fn bitand_assign(&mut self, rhs: i32) { *self = self.simd.and_i32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i32 { type Output = i32x4; #[inline(always)] fn bitand(self, rhs: i32x4) -> Self::Output { 
rhs.simd.and_i32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for i32x4 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_i32x4(self, rhs) } } impl core::ops::BitOrAssign for i32x4 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_i32x4(*self, rhs); } } impl core::ops::BitOr for i32x4 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i32) -> Self::Output { self.simd.or_i32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for i32x4 { #[inline(always)] fn bitor_assign(&mut self, rhs: i32) { *self = self.simd.or_i32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i32 { type Output = i32x4; #[inline(always)] fn bitor(self, rhs: i32x4) -> Self::Output { rhs.simd.or_i32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for i32x4 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_i32x4(self, rhs) } } impl core::ops::BitXorAssign for i32x4 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_i32x4(*self, rhs); } } impl core::ops::BitXor for i32x4 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i32) -> Self::Output { self.simd.xor_i32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for i32x4 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i32) { *self = self.simd.xor_i32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i32 { type Output = i32x4; #[inline(always)] fn bitxor(self, rhs: i32x4) -> Self::Output { rhs.simd.xor_i32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for i32x4 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_i32x4(self, rhs) } } impl core::ops::Shr for i32x4 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_i32x4(self, rhs) } } impl core::ops::ShlAssign for i32x4 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_i32x4(*self, rhs); } } impl core::ops::ShrAssign for i32x4 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_i32x4(*self, rhs); } } impl core::ops::Shr for i32x4 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_i32x4(self, rhs) } } impl core::ops::ShrAssign for i32x4 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_i32x4(*self, rhs); } } impl core::ops::Add for u32x4 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_u32x4(self, rhs) } } impl core::ops::AddAssign for u32x4 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_u32x4(*self, rhs); } } impl core::ops::Add for u32x4 { type Output = Self; #[inline(always)] fn add(self, rhs: u32) -> Self::Output { self.simd.add_u32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for u32x4 { #[inline(always)] fn add_assign(&mut self, rhs: u32) { *self = self.simd.add_u32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for u32 { type Output = u32x4; #[inline(always)] fn add(self, rhs: u32x4) -> Self::Output { rhs.simd.add_u32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for u32x4 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_u32x4(self, rhs) } } impl core::ops::SubAssign for u32x4 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = 
self.simd.sub_u32x4(*self, rhs); } } impl core::ops::Sub for u32x4 { type Output = Self; #[inline(always)] fn sub(self, rhs: u32) -> Self::Output { self.simd.sub_u32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for u32x4 { #[inline(always)] fn sub_assign(&mut self, rhs: u32) { *self = self.simd.sub_u32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for u32 { type Output = u32x4; #[inline(always)] fn sub(self, rhs: u32x4) -> Self::Output { rhs.simd.sub_u32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for u32x4 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_u32x4(self, rhs) } } impl core::ops::MulAssign for u32x4 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_u32x4(*self, rhs); } } impl core::ops::Mul for u32x4 { type Output = Self; #[inline(always)] fn mul(self, rhs: u32) -> Self::Output { self.simd.mul_u32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for u32x4 { #[inline(always)] fn mul_assign(&mut self, rhs: u32) { *self = self.simd.mul_u32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for u32 { type Output = u32x4; #[inline(always)] fn mul(self, rhs: u32x4) -> Self::Output { rhs.simd.mul_u32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for u32x4 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_u32x4(self, rhs) } } impl core::ops::BitAndAssign for u32x4 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_u32x4(*self, rhs); } } impl core::ops::BitAnd for u32x4 { type Output = Self; #[inline(always)] fn bitand(self, rhs: u32) -> Self::Output { self.simd.and_u32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for u32x4 { #[inline(always)] fn bitand_assign(&mut self, rhs: u32) { *self = self.simd.and_u32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for u32 { type Output = u32x4; #[inline(always)] fn bitand(self, rhs: u32x4) -> Self::Output { rhs.simd.and_u32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for u32x4 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_u32x4(self, rhs) } } impl core::ops::BitOrAssign for u32x4 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_u32x4(*self, rhs); } } impl core::ops::BitOr for u32x4 { type Output = Self; #[inline(always)] fn bitor(self, rhs: u32) -> Self::Output { self.simd.or_u32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for u32x4 { #[inline(always)] fn bitor_assign(&mut self, rhs: u32) { *self = self.simd.or_u32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for u32 { type Output = u32x4; #[inline(always)] fn bitor(self, rhs: u32x4) -> Self::Output { rhs.simd.or_u32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for u32x4 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_u32x4(self, rhs) } } impl core::ops::BitXorAssign for u32x4 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_u32x4(*self, rhs); } } impl core::ops::BitXor for u32x4 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: u32) -> Self::Output { self.simd.xor_u32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for u32x4 { #[inline(always)] fn bitxor_assign(&mut self, rhs: u32) { *self = self.simd.xor_u32x4(*self, rhs.simd_into(self.simd)); } } impl 
core::ops::BitXor> for u32 { type Output = u32x4; #[inline(always)] fn bitxor(self, rhs: u32x4) -> Self::Output { rhs.simd.xor_u32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for u32x4 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_u32x4(self, rhs) } } impl core::ops::Shr for u32x4 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_u32x4(self, rhs) } } impl core::ops::ShlAssign for u32x4 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_u32x4(*self, rhs); } } impl core::ops::ShrAssign for u32x4 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_u32x4(*self, rhs); } } impl core::ops::Shr for u32x4 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_u32x4(self, rhs) } } impl core::ops::ShrAssign for u32x4 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_u32x4(*self, rhs); } } impl core::ops::BitAnd for mask32x4 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask32x4(self, rhs) } } impl core::ops::BitAndAssign for mask32x4 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask32x4(*self, rhs); } } impl core::ops::BitAnd for mask32x4 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i32) -> Self::Output { self.simd.and_mask32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask32x4 { #[inline(always)] fn bitand_assign(&mut self, rhs: i32) { *self = self.simd.and_mask32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i32 { type Output = mask32x4; #[inline(always)] fn bitand(self, rhs: mask32x4) -> Self::Output { rhs.simd.and_mask32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask32x4 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask32x4(self, rhs) } } impl core::ops::BitOrAssign for mask32x4 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask32x4(*self, rhs); } } impl core::ops::BitOr for mask32x4 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i32) -> Self::Output { self.simd.or_mask32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask32x4 { #[inline(always)] fn bitor_assign(&mut self, rhs: i32) { *self = self.simd.or_mask32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i32 { type Output = mask32x4; #[inline(always)] fn bitor(self, rhs: mask32x4) -> Self::Output { rhs.simd.or_mask32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask32x4 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask32x4(self, rhs) } } impl core::ops::BitXorAssign for mask32x4 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask32x4(*self, rhs); } } impl core::ops::BitXor for mask32x4 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i32) -> Self::Output { self.simd.xor_mask32x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask32x4 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i32) { *self = self.simd.xor_mask32x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i32 { type Output = mask32x4; #[inline(always)] fn bitxor(self, rhs: mask32x4) -> Self::Output { rhs.simd.xor_mask32x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not 
for mask32x4 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { self.simd.not_mask32x4(self) } } impl core::ops::Neg for f64x2 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_f64x2(self) } } impl core::ops::Add for f64x2 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_f64x2(self, rhs) } } impl core::ops::AddAssign for f64x2 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_f64x2(*self, rhs); } } impl core::ops::Add for f64x2 { type Output = Self; #[inline(always)] fn add(self, rhs: f64) -> Self::Output { self.simd.add_f64x2(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for f64x2 { #[inline(always)] fn add_assign(&mut self, rhs: f64) { *self = self.simd.add_f64x2(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for f64 { type Output = f64x2; #[inline(always)] fn add(self, rhs: f64x2) -> Self::Output { rhs.simd.add_f64x2(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for f64x2 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_f64x2(self, rhs) } } impl core::ops::SubAssign for f64x2 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_f64x2(*self, rhs); } } impl core::ops::Sub for f64x2 { type Output = Self; #[inline(always)] fn sub(self, rhs: f64) -> Self::Output { self.simd.sub_f64x2(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for f64x2 { #[inline(always)] fn sub_assign(&mut self, rhs: f64) { *self = self.simd.sub_f64x2(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for f64 { type Output = f64x2; #[inline(always)] fn sub(self, rhs: f64x2) -> Self::Output { rhs.simd.sub_f64x2(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for f64x2 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_f64x2(self, rhs) } } impl core::ops::MulAssign for f64x2 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_f64x2(*self, rhs); } } impl core::ops::Mul for f64x2 { type Output = Self; #[inline(always)] fn mul(self, rhs: f64) -> Self::Output { self.simd.mul_f64x2(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for f64x2 { #[inline(always)] fn mul_assign(&mut self, rhs: f64) { *self = self.simd.mul_f64x2(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for f64 { type Output = f64x2; #[inline(always)] fn mul(self, rhs: f64x2) -> Self::Output { rhs.simd.mul_f64x2(self.simd_into(rhs.simd), rhs) } } impl core::ops::Div for f64x2 { type Output = Self; #[inline(always)] fn div(self, rhs: Self) -> Self::Output { self.simd.div_f64x2(self, rhs) } } impl core::ops::DivAssign for f64x2 { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = self.simd.div_f64x2(*self, rhs); } } impl core::ops::Div for f64x2 { type Output = Self; #[inline(always)] fn div(self, rhs: f64) -> Self::Output { self.simd.div_f64x2(self, rhs.simd_into(self.simd)) } } impl core::ops::DivAssign for f64x2 { #[inline(always)] fn div_assign(&mut self, rhs: f64) { *self = self.simd.div_f64x2(*self, rhs.simd_into(self.simd)); } } impl core::ops::Div> for f64 { type Output = f64x2; #[inline(always)] fn div(self, rhs: f64x2) -> Self::Output { rhs.simd.div_f64x2(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for mask64x2 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask64x2(self, rhs) } } impl core::ops::BitAndAssign 
for mask64x2 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask64x2(*self, rhs); } } impl core::ops::BitAnd for mask64x2 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i64) -> Self::Output { self.simd.and_mask64x2(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask64x2 { #[inline(always)] fn bitand_assign(&mut self, rhs: i64) { *self = self.simd.and_mask64x2(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i64 { type Output = mask64x2; #[inline(always)] fn bitand(self, rhs: mask64x2) -> Self::Output { rhs.simd.and_mask64x2(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask64x2 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask64x2(self, rhs) } } impl core::ops::BitOrAssign for mask64x2 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask64x2(*self, rhs); } } impl core::ops::BitOr for mask64x2 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i64) -> Self::Output { self.simd.or_mask64x2(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask64x2 { #[inline(always)] fn bitor_assign(&mut self, rhs: i64) { *self = self.simd.or_mask64x2(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i64 { type Output = mask64x2; #[inline(always)] fn bitor(self, rhs: mask64x2) -> Self::Output { rhs.simd.or_mask64x2(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask64x2 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask64x2(self, rhs) } } impl core::ops::BitXorAssign for mask64x2 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask64x2(*self, rhs); } } impl core::ops::BitXor for mask64x2 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i64) -> Self::Output { self.simd.xor_mask64x2(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask64x2 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i64) { *self = self.simd.xor_mask64x2(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i64 { type Output = mask64x2; #[inline(always)] fn bitxor(self, rhs: mask64x2) -> Self::Output { rhs.simd.xor_mask64x2(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask64x2 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { self.simd.not_mask64x2(self) } } impl core::ops::Neg for f32x8 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_f32x8(self) } } impl core::ops::Add for f32x8 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_f32x8(self, rhs) } } impl core::ops::AddAssign for f32x8 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_f32x8(*self, rhs); } } impl core::ops::Add for f32x8 { type Output = Self; #[inline(always)] fn add(self, rhs: f32) -> Self::Output { self.simd.add_f32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for f32x8 { #[inline(always)] fn add_assign(&mut self, rhs: f32) { *self = self.simd.add_f32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for f32 { type Output = f32x8; #[inline(always)] fn add(self, rhs: f32x8) -> Self::Output { rhs.simd.add_f32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for f32x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_f32x8(self, rhs) } } impl core::ops::SubAssign for f32x8 { 
#[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_f32x8(*self, rhs); } } impl core::ops::Sub for f32x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: f32) -> Self::Output { self.simd.sub_f32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for f32x8 { #[inline(always)] fn sub_assign(&mut self, rhs: f32) { *self = self.simd.sub_f32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for f32 { type Output = f32x8; #[inline(always)] fn sub(self, rhs: f32x8) -> Self::Output { rhs.simd.sub_f32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for f32x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_f32x8(self, rhs) } } impl core::ops::MulAssign for f32x8 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_f32x8(*self, rhs); } } impl core::ops::Mul for f32x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: f32) -> Self::Output { self.simd.mul_f32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for f32x8 { #[inline(always)] fn mul_assign(&mut self, rhs: f32) { *self = self.simd.mul_f32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for f32 { type Output = f32x8; #[inline(always)] fn mul(self, rhs: f32x8) -> Self::Output { rhs.simd.mul_f32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Div for f32x8 { type Output = Self; #[inline(always)] fn div(self, rhs: Self) -> Self::Output { self.simd.div_f32x8(self, rhs) } } impl core::ops::DivAssign for f32x8 { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = self.simd.div_f32x8(*self, rhs); } } impl core::ops::Div for f32x8 { type Output = Self; #[inline(always)] fn div(self, rhs: f32) -> Self::Output { self.simd.div_f32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::DivAssign for f32x8 { #[inline(always)] fn div_assign(&mut self, rhs: f32) { *self = self.simd.div_f32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Div> for f32 { type Output = f32x8; #[inline(always)] fn div(self, rhs: f32x8) -> Self::Output { rhs.simd.div_f32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Neg for i8x32 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_i8x32(self) } } impl core::ops::Add for i8x32 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_i8x32(self, rhs) } } impl core::ops::AddAssign for i8x32 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_i8x32(*self, rhs); } } impl core::ops::Add for i8x32 { type Output = Self; #[inline(always)] fn add(self, rhs: i8) -> Self::Output { self.simd.add_i8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for i8x32 { #[inline(always)] fn add_assign(&mut self, rhs: i8) { *self = self.simd.add_i8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for i8 { type Output = i8x32; #[inline(always)] fn add(self, rhs: i8x32) -> Self::Output { rhs.simd.add_i8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for i8x32 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_i8x32(self, rhs) } } impl core::ops::SubAssign for i8x32 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_i8x32(*self, rhs); } } impl core::ops::Sub for i8x32 { type Output = Self; #[inline(always)] fn sub(self, rhs: i8) -> Self::Output { self.simd.sub_i8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for i8x32 { 
#[inline(always)] fn sub_assign(&mut self, rhs: i8) { *self = self.simd.sub_i8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for i8 { type Output = i8x32; #[inline(always)] fn sub(self, rhs: i8x32) -> Self::Output { rhs.simd.sub_i8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for i8x32 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_i8x32(self, rhs) } } impl core::ops::MulAssign for i8x32 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_i8x32(*self, rhs); } } impl core::ops::Mul for i8x32 { type Output = Self; #[inline(always)] fn mul(self, rhs: i8) -> Self::Output { self.simd.mul_i8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for i8x32 { #[inline(always)] fn mul_assign(&mut self, rhs: i8) { *self = self.simd.mul_i8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for i8 { type Output = i8x32; #[inline(always)] fn mul(self, rhs: i8x32) -> Self::Output { rhs.simd.mul_i8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for i8x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_i8x32(self, rhs) } } impl core::ops::BitAndAssign for i8x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_i8x32(*self, rhs); } } impl core::ops::BitAnd for i8x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i8) -> Self::Output { self.simd.and_i8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for i8x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: i8) { *self = self.simd.and_i8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i8 { type Output = i8x32; #[inline(always)] fn bitand(self, rhs: i8x32) -> Self::Output { rhs.simd.and_i8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for i8x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_i8x32(self, rhs) } } impl core::ops::BitOrAssign for i8x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_i8x32(*self, rhs); } } impl core::ops::BitOr for i8x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i8) -> Self::Output { self.simd.or_i8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for i8x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: i8) { *self = self.simd.or_i8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i8 { type Output = i8x32; #[inline(always)] fn bitor(self, rhs: i8x32) -> Self::Output { rhs.simd.or_i8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for i8x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_i8x32(self, rhs) } } impl core::ops::BitXorAssign for i8x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_i8x32(*self, rhs); } } impl core::ops::BitXor for i8x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i8) -> Self::Output { self.simd.xor_i8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for i8x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i8) { *self = self.simd.xor_i8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i8 { type Output = i8x32; #[inline(always)] fn bitxor(self, rhs: i8x32) -> Self::Output { rhs.simd.xor_i8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for i8x32 { type Output = Self; #[inline(always)] fn shl(self, rhs: 
u32) -> Self::Output { self.simd.shl_i8x32(self, rhs) } } impl core::ops::Shr for i8x32 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_i8x32(self, rhs) } } impl core::ops::ShlAssign for i8x32 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_i8x32(*self, rhs); } } impl core::ops::ShrAssign for i8x32 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_i8x32(*self, rhs); } } impl core::ops::Shr for i8x32 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_i8x32(self, rhs) } } impl core::ops::ShrAssign for i8x32 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_i8x32(*self, rhs); } } impl core::ops::Add for u8x32 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_u8x32(self, rhs) } } impl core::ops::AddAssign for u8x32 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_u8x32(*self, rhs); } } impl core::ops::Add for u8x32 { type Output = Self; #[inline(always)] fn add(self, rhs: u8) -> Self::Output { self.simd.add_u8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for u8x32 { #[inline(always)] fn add_assign(&mut self, rhs: u8) { *self = self.simd.add_u8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for u8 { type Output = u8x32; #[inline(always)] fn add(self, rhs: u8x32) -> Self::Output { rhs.simd.add_u8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for u8x32 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_u8x32(self, rhs) } } impl core::ops::SubAssign for u8x32 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_u8x32(*self, rhs); } } impl core::ops::Sub for u8x32 { type Output = Self; #[inline(always)] fn sub(self, rhs: u8) -> Self::Output { self.simd.sub_u8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for u8x32 { #[inline(always)] fn sub_assign(&mut self, rhs: u8) { *self = self.simd.sub_u8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for u8 { type Output = u8x32; #[inline(always)] fn sub(self, rhs: u8x32) -> Self::Output { rhs.simd.sub_u8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for u8x32 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_u8x32(self, rhs) } } impl core::ops::MulAssign for u8x32 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_u8x32(*self, rhs); } } impl core::ops::Mul for u8x32 { type Output = Self; #[inline(always)] fn mul(self, rhs: u8) -> Self::Output { self.simd.mul_u8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for u8x32 { #[inline(always)] fn mul_assign(&mut self, rhs: u8) { *self = self.simd.mul_u8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for u8 { type Output = u8x32; #[inline(always)] fn mul(self, rhs: u8x32) -> Self::Output { rhs.simd.mul_u8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for u8x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_u8x32(self, rhs) } } impl core::ops::BitAndAssign for u8x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_u8x32(*self, rhs); } } impl core::ops::BitAnd for u8x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: u8) -> Self::Output { self.simd.and_u8x32(self, 
rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for u8x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: u8) { *self = self.simd.and_u8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for u8 { type Output = u8x32; #[inline(always)] fn bitand(self, rhs: u8x32) -> Self::Output { rhs.simd.and_u8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for u8x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_u8x32(self, rhs) } } impl core::ops::BitOrAssign for u8x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_u8x32(*self, rhs); } } impl core::ops::BitOr for u8x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: u8) -> Self::Output { self.simd.or_u8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for u8x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: u8) { *self = self.simd.or_u8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for u8 { type Output = u8x32; #[inline(always)] fn bitor(self, rhs: u8x32) -> Self::Output { rhs.simd.or_u8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for u8x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_u8x32(self, rhs) } } impl core::ops::BitXorAssign for u8x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_u8x32(*self, rhs); } } impl core::ops::BitXor for u8x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: u8) -> Self::Output { self.simd.xor_u8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for u8x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: u8) { *self = self.simd.xor_u8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for u8 { type Output = u8x32; #[inline(always)] fn bitxor(self, rhs: u8x32) -> Self::Output { rhs.simd.xor_u8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for u8x32 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_u8x32(self, rhs) } } impl core::ops::Shr for u8x32 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_u8x32(self, rhs) } } impl core::ops::ShlAssign for u8x32 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_u8x32(*self, rhs); } } impl core::ops::ShrAssign for u8x32 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_u8x32(*self, rhs); } } impl core::ops::Shr for u8x32 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_u8x32(self, rhs) } } impl core::ops::ShrAssign for u8x32 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_u8x32(*self, rhs); } } impl core::ops::BitAnd for mask8x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask8x32(self, rhs) } } impl core::ops::BitAndAssign for mask8x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask8x32(*self, rhs); } } impl core::ops::BitAnd for mask8x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i8) -> Self::Output { self.simd.and_mask8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask8x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: i8) { *self = self.simd.and_mask8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i8 { type Output = mask8x32; #[inline(always)] fn 
bitand(self, rhs: mask8x32) -> Self::Output { rhs.simd.and_mask8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask8x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask8x32(self, rhs) } } impl core::ops::BitOrAssign for mask8x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask8x32(*self, rhs); } } impl core::ops::BitOr for mask8x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i8) -> Self::Output { self.simd.or_mask8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask8x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: i8) { *self = self.simd.or_mask8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i8 { type Output = mask8x32; #[inline(always)] fn bitor(self, rhs: mask8x32) -> Self::Output { rhs.simd.or_mask8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask8x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask8x32(self, rhs) } } impl core::ops::BitXorAssign for mask8x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask8x32(*self, rhs); } } impl core::ops::BitXor for mask8x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i8) -> Self::Output { self.simd.xor_mask8x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask8x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i8) { *self = self.simd.xor_mask8x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i8 { type Output = mask8x32; #[inline(always)] fn bitxor(self, rhs: mask8x32) -> Self::Output { rhs.simd.xor_mask8x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask8x32 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { self.simd.not_mask8x32(self) } } impl core::ops::Neg for i16x16 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_i16x16(self) } } impl core::ops::Add for i16x16 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_i16x16(self, rhs) } } impl core::ops::AddAssign for i16x16 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_i16x16(*self, rhs); } } impl core::ops::Add for i16x16 { type Output = Self; #[inline(always)] fn add(self, rhs: i16) -> Self::Output { self.simd.add_i16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for i16x16 { #[inline(always)] fn add_assign(&mut self, rhs: i16) { *self = self.simd.add_i16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for i16 { type Output = i16x16; #[inline(always)] fn add(self, rhs: i16x16) -> Self::Output { rhs.simd.add_i16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for i16x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_i16x16(self, rhs) } } impl core::ops::SubAssign for i16x16 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_i16x16(*self, rhs); } } impl core::ops::Sub for i16x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: i16) -> Self::Output { self.simd.sub_i16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for i16x16 { #[inline(always)] fn sub_assign(&mut self, rhs: i16) { *self = self.simd.sub_i16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for i16 { type Output = i16x16; #[inline(always)] fn sub(self, rhs: i16x16) -> 
Self::Output { rhs.simd.sub_i16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for i16x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_i16x16(self, rhs) } } impl core::ops::MulAssign for i16x16 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_i16x16(*self, rhs); } } impl core::ops::Mul for i16x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: i16) -> Self::Output { self.simd.mul_i16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for i16x16 { #[inline(always)] fn mul_assign(&mut self, rhs: i16) { *self = self.simd.mul_i16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for i16 { type Output = i16x16; #[inline(always)] fn mul(self, rhs: i16x16) -> Self::Output { rhs.simd.mul_i16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for i16x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_i16x16(self, rhs) } } impl core::ops::BitAndAssign for i16x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_i16x16(*self, rhs); } } impl core::ops::BitAnd for i16x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i16) -> Self::Output { self.simd.and_i16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for i16x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: i16) { *self = self.simd.and_i16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i16 { type Output = i16x16; #[inline(always)] fn bitand(self, rhs: i16x16) -> Self::Output { rhs.simd.and_i16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for i16x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_i16x16(self, rhs) } } impl core::ops::BitOrAssign for i16x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_i16x16(*self, rhs); } } impl core::ops::BitOr for i16x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i16) -> Self::Output { self.simd.or_i16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for i16x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: i16) { *self = self.simd.or_i16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i16 { type Output = i16x16; #[inline(always)] fn bitor(self, rhs: i16x16) -> Self::Output { rhs.simd.or_i16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for i16x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_i16x16(self, rhs) } } impl core::ops::BitXorAssign for i16x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_i16x16(*self, rhs); } } impl core::ops::BitXor for i16x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i16) -> Self::Output { self.simd.xor_i16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for i16x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i16) { *self = self.simd.xor_i16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i16 { type Output = i16x16; #[inline(always)] fn bitxor(self, rhs: i16x16) -> Self::Output { rhs.simd.xor_i16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for i16x16 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_i16x16(self, rhs) } } impl core::ops::Shr for i16x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> 
Self::Output { self.simd.shr_i16x16(self, rhs) } } impl core::ops::ShlAssign for i16x16 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_i16x16(*self, rhs); } } impl core::ops::ShrAssign for i16x16 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_i16x16(*self, rhs); } } impl core::ops::Shr for i16x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_i16x16(self, rhs) } } impl core::ops::ShrAssign for i16x16 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_i16x16(*self, rhs); } } impl core::ops::Add for u16x16 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_u16x16(self, rhs) } } impl core::ops::AddAssign for u16x16 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_u16x16(*self, rhs); } } impl core::ops::Add for u16x16 { type Output = Self; #[inline(always)] fn add(self, rhs: u16) -> Self::Output { self.simd.add_u16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for u16x16 { #[inline(always)] fn add_assign(&mut self, rhs: u16) { *self = self.simd.add_u16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for u16 { type Output = u16x16; #[inline(always)] fn add(self, rhs: u16x16) -> Self::Output { rhs.simd.add_u16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for u16x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_u16x16(self, rhs) } } impl core::ops::SubAssign for u16x16 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_u16x16(*self, rhs); } } impl core::ops::Sub for u16x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: u16) -> Self::Output { self.simd.sub_u16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for u16x16 { #[inline(always)] fn sub_assign(&mut self, rhs: u16) { *self = self.simd.sub_u16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for u16 { type Output = u16x16; #[inline(always)] fn sub(self, rhs: u16x16) -> Self::Output { rhs.simd.sub_u16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for u16x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_u16x16(self, rhs) } } impl core::ops::MulAssign for u16x16 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_u16x16(*self, rhs); } } impl core::ops::Mul for u16x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: u16) -> Self::Output { self.simd.mul_u16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for u16x16 { #[inline(always)] fn mul_assign(&mut self, rhs: u16) { *self = self.simd.mul_u16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for u16 { type Output = u16x16; #[inline(always)] fn mul(self, rhs: u16x16) -> Self::Output { rhs.simd.mul_u16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for u16x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_u16x16(self, rhs) } } impl core::ops::BitAndAssign for u16x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_u16x16(*self, rhs); } } impl core::ops::BitAnd for u16x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: u16) -> Self::Output { self.simd.and_u16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for u16x16 { #[inline(always)] fn bitand_assign(&mut 
self, rhs: u16) { *self = self.simd.and_u16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for u16 { type Output = u16x16; #[inline(always)] fn bitand(self, rhs: u16x16) -> Self::Output { rhs.simd.and_u16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for u16x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_u16x16(self, rhs) } } impl core::ops::BitOrAssign for u16x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_u16x16(*self, rhs); } } impl core::ops::BitOr for u16x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: u16) -> Self::Output { self.simd.or_u16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for u16x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: u16) { *self = self.simd.or_u16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for u16 { type Output = u16x16; #[inline(always)] fn bitor(self, rhs: u16x16) -> Self::Output { rhs.simd.or_u16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for u16x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_u16x16(self, rhs) } } impl core::ops::BitXorAssign for u16x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_u16x16(*self, rhs); } } impl core::ops::BitXor for u16x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: u16) -> Self::Output { self.simd.xor_u16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for u16x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: u16) { *self = self.simd.xor_u16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for u16 { type Output = u16x16; #[inline(always)] fn bitxor(self, rhs: u16x16) -> Self::Output { rhs.simd.xor_u16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for u16x16 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_u16x16(self, rhs) } } impl core::ops::Shr for u16x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_u16x16(self, rhs) } } impl core::ops::ShlAssign for u16x16 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_u16x16(*self, rhs); } } impl core::ops::ShrAssign for u16x16 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_u16x16(*self, rhs); } } impl core::ops::Shr for u16x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_u16x16(self, rhs) } } impl core::ops::ShrAssign for u16x16 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_u16x16(*self, rhs); } } impl core::ops::BitAnd for mask16x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask16x16(self, rhs) } } impl core::ops::BitAndAssign for mask16x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask16x16(*self, rhs); } } impl core::ops::BitAnd for mask16x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i16) -> Self::Output { self.simd.and_mask16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask16x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: i16) { *self = self.simd.and_mask16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i16 { type Output = mask16x16; #[inline(always)] fn bitand(self, rhs: mask16x16) -> Self::Output { 
rhs.simd.and_mask16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask16x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask16x16(self, rhs) } } impl core::ops::BitOrAssign for mask16x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask16x16(*self, rhs); } } impl core::ops::BitOr for mask16x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i16) -> Self::Output { self.simd.or_mask16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask16x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: i16) { *self = self.simd.or_mask16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i16 { type Output = mask16x16; #[inline(always)] fn bitor(self, rhs: mask16x16) -> Self::Output { rhs.simd.or_mask16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask16x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask16x16(self, rhs) } } impl core::ops::BitXorAssign for mask16x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask16x16(*self, rhs); } } impl core::ops::BitXor for mask16x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i16) -> Self::Output { self.simd.xor_mask16x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask16x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i16) { *self = self.simd.xor_mask16x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i16 { type Output = mask16x16; #[inline(always)] fn bitxor(self, rhs: mask16x16) -> Self::Output { rhs.simd.xor_mask16x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask16x16 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { self.simd.not_mask16x16(self) } } impl core::ops::Neg for i32x8 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_i32x8(self) } } impl core::ops::Add for i32x8 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_i32x8(self, rhs) } } impl core::ops::AddAssign for i32x8 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_i32x8(*self, rhs); } } impl core::ops::Add for i32x8 { type Output = Self; #[inline(always)] fn add(self, rhs: i32) -> Self::Output { self.simd.add_i32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for i32x8 { #[inline(always)] fn add_assign(&mut self, rhs: i32) { *self = self.simd.add_i32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for i32 { type Output = i32x8; #[inline(always)] fn add(self, rhs: i32x8) -> Self::Output { rhs.simd.add_i32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for i32x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_i32x8(self, rhs) } } impl core::ops::SubAssign for i32x8 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_i32x8(*self, rhs); } } impl core::ops::Sub for i32x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: i32) -> Self::Output { self.simd.sub_i32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for i32x8 { #[inline(always)] fn sub_assign(&mut self, rhs: i32) { *self = self.simd.sub_i32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for i32 { type Output = i32x8; #[inline(always)] fn sub(self, rhs: i32x8) -> Self::Output { 
rhs.simd.sub_i32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for i32x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_i32x8(self, rhs) } } impl core::ops::MulAssign for i32x8 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_i32x8(*self, rhs); } } impl core::ops::Mul for i32x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: i32) -> Self::Output { self.simd.mul_i32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for i32x8 { #[inline(always)] fn mul_assign(&mut self, rhs: i32) { *self = self.simd.mul_i32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for i32 { type Output = i32x8; #[inline(always)] fn mul(self, rhs: i32x8) -> Self::Output { rhs.simd.mul_i32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for i32x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_i32x8(self, rhs) } } impl core::ops::BitAndAssign for i32x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_i32x8(*self, rhs); } } impl core::ops::BitAnd for i32x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i32) -> Self::Output { self.simd.and_i32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for i32x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: i32) { *self = self.simd.and_i32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i32 { type Output = i32x8; #[inline(always)] fn bitand(self, rhs: i32x8) -> Self::Output { rhs.simd.and_i32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for i32x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_i32x8(self, rhs) } } impl core::ops::BitOrAssign for i32x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_i32x8(*self, rhs); } } impl core::ops::BitOr for i32x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i32) -> Self::Output { self.simd.or_i32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for i32x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: i32) { *self = self.simd.or_i32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i32 { type Output = i32x8; #[inline(always)] fn bitor(self, rhs: i32x8) -> Self::Output { rhs.simd.or_i32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for i32x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_i32x8(self, rhs) } } impl core::ops::BitXorAssign for i32x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_i32x8(*self, rhs); } } impl core::ops::BitXor for i32x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i32) -> Self::Output { self.simd.xor_i32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for i32x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i32) { *self = self.simd.xor_i32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i32 { type Output = i32x8; #[inline(always)] fn bitxor(self, rhs: i32x8) -> Self::Output { rhs.simd.xor_i32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for i32x8 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_i32x8(self, rhs) } } impl core::ops::Shr for i32x8 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_i32x8(self, rhs) } } impl 
core::ops::ShlAssign for i32x8 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_i32x8(*self, rhs); } } impl core::ops::ShrAssign for i32x8 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_i32x8(*self, rhs); } } impl core::ops::Shr for i32x8 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_i32x8(self, rhs) } } impl core::ops::ShrAssign for i32x8 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_i32x8(*self, rhs); } } impl core::ops::Add for u32x8 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_u32x8(self, rhs) } } impl core::ops::AddAssign for u32x8 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_u32x8(*self, rhs); } } impl core::ops::Add for u32x8 { type Output = Self; #[inline(always)] fn add(self, rhs: u32) -> Self::Output { self.simd.add_u32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for u32x8 { #[inline(always)] fn add_assign(&mut self, rhs: u32) { *self = self.simd.add_u32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for u32 { type Output = u32x8; #[inline(always)] fn add(self, rhs: u32x8) -> Self::Output { rhs.simd.add_u32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for u32x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_u32x8(self, rhs) } } impl core::ops::SubAssign for u32x8 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_u32x8(*self, rhs); } } impl core::ops::Sub for u32x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: u32) -> Self::Output { self.simd.sub_u32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for u32x8 { #[inline(always)] fn sub_assign(&mut self, rhs: u32) { *self = self.simd.sub_u32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for u32 { type Output = u32x8; #[inline(always)] fn sub(self, rhs: u32x8) -> Self::Output { rhs.simd.sub_u32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for u32x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_u32x8(self, rhs) } } impl core::ops::MulAssign for u32x8 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_u32x8(*self, rhs); } } impl core::ops::Mul for u32x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: u32) -> Self::Output { self.simd.mul_u32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for u32x8 { #[inline(always)] fn mul_assign(&mut self, rhs: u32) { *self = self.simd.mul_u32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for u32 { type Output = u32x8; #[inline(always)] fn mul(self, rhs: u32x8) -> Self::Output { rhs.simd.mul_u32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for u32x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_u32x8(self, rhs) } } impl core::ops::BitAndAssign for u32x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_u32x8(*self, rhs); } } impl core::ops::BitAnd for u32x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: u32) -> Self::Output { self.simd.and_u32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for u32x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: u32) { *self = self.simd.and_u32x8(*self, rhs.simd_into(self.simd)); } } impl 
core::ops::BitAnd> for u32 { type Output = u32x8; #[inline(always)] fn bitand(self, rhs: u32x8) -> Self::Output { rhs.simd.and_u32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for u32x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_u32x8(self, rhs) } } impl core::ops::BitOrAssign for u32x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_u32x8(*self, rhs); } } impl core::ops::BitOr for u32x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: u32) -> Self::Output { self.simd.or_u32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for u32x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: u32) { *self = self.simd.or_u32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for u32 { type Output = u32x8; #[inline(always)] fn bitor(self, rhs: u32x8) -> Self::Output { rhs.simd.or_u32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for u32x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_u32x8(self, rhs) } } impl core::ops::BitXorAssign for u32x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_u32x8(*self, rhs); } } impl core::ops::BitXor for u32x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: u32) -> Self::Output { self.simd.xor_u32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for u32x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: u32) { *self = self.simd.xor_u32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for u32 { type Output = u32x8; #[inline(always)] fn bitxor(self, rhs: u32x8) -> Self::Output { rhs.simd.xor_u32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for u32x8 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_u32x8(self, rhs) } } impl core::ops::Shr for u32x8 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_u32x8(self, rhs) } } impl core::ops::ShlAssign for u32x8 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_u32x8(*self, rhs); } } impl core::ops::ShrAssign for u32x8 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_u32x8(*self, rhs); } } impl core::ops::Shr for u32x8 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_u32x8(self, rhs) } } impl core::ops::ShrAssign for u32x8 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_u32x8(*self, rhs); } } impl core::ops::BitAnd for mask32x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask32x8(self, rhs) } } impl core::ops::BitAndAssign for mask32x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask32x8(*self, rhs); } } impl core::ops::BitAnd for mask32x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i32) -> Self::Output { self.simd.and_mask32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask32x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: i32) { *self = self.simd.and_mask32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i32 { type Output = mask32x8; #[inline(always)] fn bitand(self, rhs: mask32x8) -> Self::Output { rhs.simd.and_mask32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask32x8 { type Output = Self; #[inline(always)] fn 
bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask32x8(self, rhs) } } impl core::ops::BitOrAssign for mask32x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask32x8(*self, rhs); } } impl core::ops::BitOr for mask32x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i32) -> Self::Output { self.simd.or_mask32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask32x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: i32) { *self = self.simd.or_mask32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i32 { type Output = mask32x8; #[inline(always)] fn bitor(self, rhs: mask32x8) -> Self::Output { rhs.simd.or_mask32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask32x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask32x8(self, rhs) } } impl core::ops::BitXorAssign for mask32x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask32x8(*self, rhs); } } impl core::ops::BitXor for mask32x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i32) -> Self::Output { self.simd.xor_mask32x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask32x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i32) { *self = self.simd.xor_mask32x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i32 { type Output = mask32x8; #[inline(always)] fn bitxor(self, rhs: mask32x8) -> Self::Output { rhs.simd.xor_mask32x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask32x8 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { self.simd.not_mask32x8(self) } } impl core::ops::Neg for f64x4 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_f64x4(self) } } impl core::ops::Add for f64x4 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_f64x4(self, rhs) } } impl core::ops::AddAssign for f64x4 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_f64x4(*self, rhs); } } impl core::ops::Add for f64x4 { type Output = Self; #[inline(always)] fn add(self, rhs: f64) -> Self::Output { self.simd.add_f64x4(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for f64x4 { #[inline(always)] fn add_assign(&mut self, rhs: f64) { *self = self.simd.add_f64x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for f64 { type Output = f64x4; #[inline(always)] fn add(self, rhs: f64x4) -> Self::Output { rhs.simd.add_f64x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for f64x4 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_f64x4(self, rhs) } } impl core::ops::SubAssign for f64x4 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_f64x4(*self, rhs); } } impl core::ops::Sub for f64x4 { type Output = Self; #[inline(always)] fn sub(self, rhs: f64) -> Self::Output { self.simd.sub_f64x4(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for f64x4 { #[inline(always)] fn sub_assign(&mut self, rhs: f64) { *self = self.simd.sub_f64x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for f64 { type Output = f64x4; #[inline(always)] fn sub(self, rhs: f64x4) -> Self::Output { rhs.simd.sub_f64x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for f64x4 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_f64x4(self, 
rhs) } } impl core::ops::MulAssign for f64x4 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_f64x4(*self, rhs); } } impl core::ops::Mul for f64x4 { type Output = Self; #[inline(always)] fn mul(self, rhs: f64) -> Self::Output { self.simd.mul_f64x4(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for f64x4 { #[inline(always)] fn mul_assign(&mut self, rhs: f64) { *self = self.simd.mul_f64x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for f64 { type Output = f64x4; #[inline(always)] fn mul(self, rhs: f64x4) -> Self::Output { rhs.simd.mul_f64x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Div for f64x4 { type Output = Self; #[inline(always)] fn div(self, rhs: Self) -> Self::Output { self.simd.div_f64x4(self, rhs) } } impl core::ops::DivAssign for f64x4 { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = self.simd.div_f64x4(*self, rhs); } } impl core::ops::Div for f64x4 { type Output = Self; #[inline(always)] fn div(self, rhs: f64) -> Self::Output { self.simd.div_f64x4(self, rhs.simd_into(self.simd)) } } impl core::ops::DivAssign for f64x4 { #[inline(always)] fn div_assign(&mut self, rhs: f64) { *self = self.simd.div_f64x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::Div> for f64 { type Output = f64x4; #[inline(always)] fn div(self, rhs: f64x4) -> Self::Output { rhs.simd.div_f64x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for mask64x4 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask64x4(self, rhs) } } impl core::ops::BitAndAssign for mask64x4 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask64x4(*self, rhs); } } impl core::ops::BitAnd for mask64x4 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i64) -> Self::Output { self.simd.and_mask64x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask64x4 { #[inline(always)] fn bitand_assign(&mut self, rhs: i64) { *self = self.simd.and_mask64x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i64 { type Output = mask64x4; #[inline(always)] fn bitand(self, rhs: mask64x4) -> Self::Output { rhs.simd.and_mask64x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask64x4 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask64x4(self, rhs) } } impl core::ops::BitOrAssign for mask64x4 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask64x4(*self, rhs); } } impl core::ops::BitOr for mask64x4 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i64) -> Self::Output { self.simd.or_mask64x4(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask64x4 { #[inline(always)] fn bitor_assign(&mut self, rhs: i64) { *self = self.simd.or_mask64x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i64 { type Output = mask64x4; #[inline(always)] fn bitor(self, rhs: mask64x4) -> Self::Output { rhs.simd.or_mask64x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask64x4 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask64x4(self, rhs) } } impl core::ops::BitXorAssign for mask64x4 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask64x4(*self, rhs); } } impl core::ops::BitXor for mask64x4 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i64) -> Self::Output { self.simd.xor_mask64x4(self, 
rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask64x4 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i64) { *self = self.simd.xor_mask64x4(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i64 { type Output = mask64x4; #[inline(always)] fn bitxor(self, rhs: mask64x4) -> Self::Output { rhs.simd.xor_mask64x4(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask64x4 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { self.simd.not_mask64x4(self) } } impl core::ops::Neg for f32x16 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_f32x16(self) } } impl core::ops::Add for f32x16 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_f32x16(self, rhs) } } impl core::ops::AddAssign for f32x16 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_f32x16(*self, rhs); } } impl core::ops::Add for f32x16 { type Output = Self; #[inline(always)] fn add(self, rhs: f32) -> Self::Output { self.simd.add_f32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for f32x16 { #[inline(always)] fn add_assign(&mut self, rhs: f32) { *self = self.simd.add_f32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for f32 { type Output = f32x16; #[inline(always)] fn add(self, rhs: f32x16) -> Self::Output { rhs.simd.add_f32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for f32x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_f32x16(self, rhs) } } impl core::ops::SubAssign for f32x16 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_f32x16(*self, rhs); } } impl core::ops::Sub for f32x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: f32) -> Self::Output { self.simd.sub_f32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for f32x16 { #[inline(always)] fn sub_assign(&mut self, rhs: f32) { *self = self.simd.sub_f32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for f32 { type Output = f32x16; #[inline(always)] fn sub(self, rhs: f32x16) -> Self::Output { rhs.simd.sub_f32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for f32x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_f32x16(self, rhs) } } impl core::ops::MulAssign for f32x16 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_f32x16(*self, rhs); } } impl core::ops::Mul for f32x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: f32) -> Self::Output { self.simd.mul_f32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for f32x16 { #[inline(always)] fn mul_assign(&mut self, rhs: f32) { *self = self.simd.mul_f32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for f32 { type Output = f32x16; #[inline(always)] fn mul(self, rhs: f32x16) -> Self::Output { rhs.simd.mul_f32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Div for f32x16 { type Output = Self; #[inline(always)] fn div(self, rhs: Self) -> Self::Output { self.simd.div_f32x16(self, rhs) } } impl core::ops::DivAssign for f32x16 { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = self.simd.div_f32x16(*self, rhs); } } impl core::ops::Div for f32x16 { type Output = Self; #[inline(always)] fn div(self, rhs: f32) -> Self::Output { self.simd.div_f32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::DivAssign for f32x16 { #[inline(always)] fn 
impl<S: Simd> core::ops::Neg for f32x16<S> { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_f32x16(self) } }
impl<S: Simd> core::ops::Add for f32x16<S> { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_f32x16(self, rhs) } }
impl<S: Simd> core::ops::AddAssign for f32x16<S> { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_f32x16(*self, rhs); } }
impl<S: Simd> core::ops::Add<f32> for f32x16<S> { type Output = Self; #[inline(always)] fn add(self, rhs: f32) -> Self::Output { self.simd.add_f32x16(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::AddAssign<f32> for f32x16<S> { #[inline(always)] fn add_assign(&mut self, rhs: f32) { *self = self.simd.add_f32x16(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Add<f32x16<S>> for f32 { type Output = f32x16<S>; #[inline(always)] fn add(self, rhs: f32x16<S>) -> Self::Output { rhs.simd.add_f32x16(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Sub for f32x16<S> { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_f32x16(self, rhs) } }
impl<S: Simd> core::ops::SubAssign for f32x16<S> { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_f32x16(*self, rhs); } }
impl<S: Simd> core::ops::Sub<f32> for f32x16<S> { type Output = Self; #[inline(always)] fn sub(self, rhs: f32) -> Self::Output { self.simd.sub_f32x16(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::SubAssign<f32> for f32x16<S> { #[inline(always)] fn sub_assign(&mut self, rhs: f32) { *self = self.simd.sub_f32x16(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Sub<f32x16<S>> for f32 { type Output = f32x16<S>; #[inline(always)] fn sub(self, rhs: f32x16<S>) -> Self::Output { rhs.simd.sub_f32x16(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Mul for f32x16<S> { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_f32x16(self, rhs) } }
impl<S: Simd> core::ops::MulAssign for f32x16<S> { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_f32x16(*self, rhs); } }
impl<S: Simd> core::ops::Mul<f32> for f32x16<S> { type Output = Self; #[inline(always)] fn mul(self, rhs: f32) -> Self::Output { self.simd.mul_f32x16(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::MulAssign<f32> for f32x16<S> { #[inline(always)] fn mul_assign(&mut self, rhs: f32) { *self = self.simd.mul_f32x16(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Mul<f32x16<S>> for f32 { type Output = f32x16<S>; #[inline(always)] fn mul(self, rhs: f32x16<S>) -> Self::Output { rhs.simd.mul_f32x16(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Div for f32x16<S> { type Output = Self; #[inline(always)] fn div(self, rhs: Self) -> Self::Output { self.simd.div_f32x16(self, rhs) } }
impl<S: Simd> core::ops::DivAssign for f32x16<S> { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = self.simd.div_f32x16(*self, rhs); } }
impl<S: Simd> core::ops::Div<f32> for f32x16<S> { type Output = Self; #[inline(always)] fn div(self, rhs: f32) -> Self::Output { self.simd.div_f32x16(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::DivAssign<f32> for f32x16<S> { #[inline(always)] fn div_assign(&mut self, rhs: f32) { *self = self.simd.div_f32x16(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Div<f32x16<S>> for f32 { type Output = f32x16<S>; #[inline(always)] fn div(self, rhs: f32x16<S>) -> Self::Output { rhs.simd.div_f32x16(self.simd_into(rhs.simd), rhs) } }
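// `f32x16<S>` carries the same operator surface as the narrower float types,
// so lane-wise kernels can be written against it directly; e.g. a softsign
// x / (1 + |x|). This sketch assumes an `abs_f32x16` method on `Simd`, by
// analogy with the `abs_f32x4`/`abs_f32x8` methods declared in the trait
// (illustrative only, not generated code):
//
//     fn softsign<S: Simd>(x: f32x16<S>) -> f32x16<S> {
//         let ax = x.simd.abs_f32x16(x); // assumed analog of abs_f32x4
//         x / (ax + 1.0f32)
//     }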
impl<S: Simd> core::ops::Neg for i8x64<S> { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_i8x64(self) } }
impl<S: Simd> core::ops::Add for i8x64<S> { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_i8x64(self, rhs) } }
impl<S: Simd> core::ops::AddAssign for i8x64<S> { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_i8x64(*self, rhs); } }
impl<S: Simd> core::ops::Add<i8> for i8x64<S> { type Output = Self; #[inline(always)] fn add(self, rhs: i8) -> Self::Output { self.simd.add_i8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::AddAssign<i8> for i8x64<S> { #[inline(always)] fn add_assign(&mut self, rhs: i8) { *self = self.simd.add_i8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Add<i8x64<S>> for i8 { type Output = i8x64<S>; #[inline(always)] fn add(self, rhs: i8x64<S>) -> Self::Output { rhs.simd.add_i8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Sub for i8x64<S> { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_i8x64(self, rhs) } }
impl<S: Simd> core::ops::SubAssign for i8x64<S> { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_i8x64(*self, rhs); } }
impl<S: Simd> core::ops::Sub<i8> for i8x64<S> { type Output = Self; #[inline(always)] fn sub(self, rhs: i8) -> Self::Output { self.simd.sub_i8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::SubAssign<i8> for i8x64<S> { #[inline(always)] fn sub_assign(&mut self, rhs: i8) { *self = self.simd.sub_i8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Sub<i8x64<S>> for i8 { type Output = i8x64<S>; #[inline(always)] fn sub(self, rhs: i8x64<S>) -> Self::Output { rhs.simd.sub_i8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Mul for i8x64<S> { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_i8x64(self, rhs) } }
impl<S: Simd> core::ops::MulAssign for i8x64<S> { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_i8x64(*self, rhs); } }
impl<S: Simd> core::ops::Mul<i8> for i8x64<S> { type Output = Self; #[inline(always)] fn mul(self, rhs: i8) -> Self::Output { self.simd.mul_i8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::MulAssign<i8> for i8x64<S> { #[inline(always)] fn mul_assign(&mut self, rhs: i8) { *self = self.simd.mul_i8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Mul<i8x64<S>> for i8 { type Output = i8x64<S>; #[inline(always)] fn mul(self, rhs: i8x64<S>) -> Self::Output { rhs.simd.mul_i8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::BitAnd for i8x64<S> { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_i8x64(self, rhs) } }
impl<S: Simd> core::ops::BitAndAssign for i8x64<S> { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_i8x64(*self, rhs); } }
impl<S: Simd> core::ops::BitAnd<i8> for i8x64<S> { type Output = Self; #[inline(always)] fn bitand(self, rhs: i8) -> Self::Output { self.simd.and_i8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::BitAndAssign<i8> for i8x64<S> { #[inline(always)] fn bitand_assign(&mut self, rhs: i8) { *self = self.simd.and_i8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::BitAnd<i8x64<S>> for i8 { type Output = i8x64<S>; #[inline(always)] fn bitand(self, rhs: i8x64<S>) -> Self::Output { rhs.simd.and_i8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::BitOr for i8x64<S> { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_i8x64(self, rhs) } }
impl<S: Simd> core::ops::BitOrAssign for i8x64<S> { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_i8x64(*self, rhs); } }
impl<S: Simd> core::ops::BitOr<i8> for i8x64<S> { type Output = Self; #[inline(always)] fn bitor(self, rhs: i8) -> Self::Output { self.simd.or_i8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::BitOrAssign<i8> for i8x64<S> { #[inline(always)] fn bitor_assign(&mut self, rhs: i8) { *self = self.simd.or_i8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::BitOr<i8x64<S>> for i8 { type Output = i8x64<S>; #[inline(always)] fn bitor(self, rhs: i8x64<S>) -> Self::Output { rhs.simd.or_i8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::BitXor for i8x64<S> { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_i8x64(self, rhs) } }
impl<S: Simd> core::ops::BitXorAssign for i8x64<S> { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_i8x64(*self, rhs); } }
impl<S: Simd> core::ops::BitXor<i8> for i8x64<S> { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i8) -> Self::Output { self.simd.xor_i8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::BitXorAssign<i8> for i8x64<S> { #[inline(always)] fn bitxor_assign(&mut self, rhs: i8) { *self = self.simd.xor_i8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::BitXor<i8x64<S>> for i8 { type Output = i8x64<S>; #[inline(always)] fn bitxor(self, rhs: i8x64<S>) -> Self::Output { rhs.simd.xor_i8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Shl<u32> for i8x64<S> { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_i8x64(self, rhs) } }
impl<S: Simd> core::ops::Shr<u32> for i8x64<S> { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_i8x64(self, rhs) } }
impl<S: Simd> core::ops::ShlAssign<u32> for i8x64<S> { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_i8x64(*self, rhs); } }
impl<S: Simd> core::ops::ShrAssign<u32> for i8x64<S> { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_i8x64(*self, rhs); } }
impl<S: Simd> core::ops::Shr for i8x64<S> { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_i8x64(self, rhs) } }
impl<S: Simd> core::ops::ShrAssign for i8x64<S> { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_i8x64(*self, rhs); } }
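// Two right-shift flavors are generated: `v >> n` with `n: u32` shifts every
// lane by the same amount (`shr_i8x64`), while `v >> w` with a vector rhs
// shifts each lane by the matching lane of `w` (`shrv_i8x64`). Sketch
// (hypothetical helper, not generated code):
//
//     fn shifts<S: Simd>(v: i8x64<S>, w: i8x64<S>) -> (i8x64<S>, i8x64<S>) {
//         (v >> 1u32, v >> w)
//     }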
impl<S: Simd> core::ops::Add for u8x64<S> { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_u8x64(self, rhs) } }
impl<S: Simd> core::ops::AddAssign for u8x64<S> { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_u8x64(*self, rhs); } }
impl<S: Simd> core::ops::Add<u8> for u8x64<S> { type Output = Self; #[inline(always)] fn add(self, rhs: u8) -> Self::Output { self.simd.add_u8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::AddAssign<u8> for u8x64<S> { #[inline(always)] fn add_assign(&mut self, rhs: u8) { *self = self.simd.add_u8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Add<u8x64<S>> for u8 { type Output = u8x64<S>; #[inline(always)] fn add(self, rhs: u8x64<S>) -> Self::Output { rhs.simd.add_u8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Sub for u8x64<S> { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_u8x64(self, rhs) } }
impl<S: Simd> core::ops::SubAssign for u8x64<S> { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_u8x64(*self, rhs); } }
impl<S: Simd> core::ops::Sub<u8> for u8x64<S> { type Output = Self; #[inline(always)] fn sub(self, rhs: u8) -> Self::Output { self.simd.sub_u8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::SubAssign<u8> for u8x64<S> { #[inline(always)] fn sub_assign(&mut self, rhs: u8) { *self = self.simd.sub_u8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Sub<u8x64<S>> for u8 { type Output = u8x64<S>; #[inline(always)] fn sub(self, rhs: u8x64<S>) -> Self::Output { rhs.simd.sub_u8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Mul for u8x64<S> { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_u8x64(self, rhs) } }
impl<S: Simd> core::ops::MulAssign for u8x64<S> { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_u8x64(*self, rhs); } }
impl<S: Simd> core::ops::Mul<u8> for u8x64<S> { type Output = Self; #[inline(always)] fn mul(self, rhs: u8) -> Self::Output { self.simd.mul_u8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::MulAssign<u8> for u8x64<S> { #[inline(always)] fn mul_assign(&mut self, rhs: u8) { *self = self.simd.mul_u8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::Mul<u8x64<S>> for u8 { type Output = u8x64<S>; #[inline(always)] fn mul(self, rhs: u8x64<S>) -> Self::Output { rhs.simd.mul_u8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::BitAnd for u8x64<S> { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_u8x64(self, rhs) } }
impl<S: Simd> core::ops::BitAndAssign for u8x64<S> { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_u8x64(*self, rhs); } }
impl<S: Simd> core::ops::BitAnd<u8> for u8x64<S> { type Output = Self; #[inline(always)] fn bitand(self, rhs: u8) -> Self::Output { self.simd.and_u8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::BitAndAssign<u8> for u8x64<S> { #[inline(always)] fn bitand_assign(&mut self, rhs: u8) { *self = self.simd.and_u8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::BitAnd<u8x64<S>> for u8 { type Output = u8x64<S>; #[inline(always)] fn bitand(self, rhs: u8x64<S>) -> Self::Output { rhs.simd.and_u8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::BitOr for u8x64<S> { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_u8x64(self, rhs) } }
impl<S: Simd> core::ops::BitOrAssign for u8x64<S> { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_u8x64(*self, rhs); } }
impl<S: Simd> core::ops::BitOr<u8> for u8x64<S> { type Output = Self; #[inline(always)] fn bitor(self, rhs: u8) -> Self::Output { self.simd.or_u8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::BitOrAssign<u8> for u8x64<S> { #[inline(always)] fn bitor_assign(&mut self, rhs: u8) { *self = self.simd.or_u8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::BitOr<u8x64<S>> for u8 { type Output = u8x64<S>; #[inline(always)] fn bitor(self, rhs: u8x64<S>) -> Self::Output { rhs.simd.or_u8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::BitXor for u8x64<S> { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_u8x64(self, rhs) } }
impl<S: Simd> core::ops::BitXorAssign for u8x64<S> { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_u8x64(*self, rhs); } }
impl<S: Simd> core::ops::BitXor<u8> for u8x64<S> { type Output = Self; #[inline(always)] fn bitxor(self, rhs: u8) -> Self::Output { self.simd.xor_u8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::BitXorAssign<u8> for u8x64<S> { #[inline(always)] fn bitxor_assign(&mut self, rhs: u8) { *self = self.simd.xor_u8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::BitXor<u8x64<S>> for u8 { type Output = u8x64<S>; #[inline(always)] fn bitxor(self, rhs: u8x64<S>) -> Self::Output { rhs.simd.xor_u8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Shl<u32> for u8x64<S> { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_u8x64(self, rhs) } }
impl<S: Simd> core::ops::Shr<u32> for u8x64<S> { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_u8x64(self, rhs) } }
impl<S: Simd> core::ops::ShlAssign<u32> for u8x64<S> { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_u8x64(*self, rhs); } }
impl<S: Simd> core::ops::ShrAssign<u32> for u8x64<S> { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_u8x64(*self, rhs); } }
impl<S: Simd> core::ops::Shr for u8x64<S> { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_u8x64(self, rhs) } }
impl<S: Simd> core::ops::ShrAssign for u8x64<S> { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_u8x64(*self, rhs); } }
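// `u8x64<S>` mirrors `i8x64<S>` except for `Neg`, which this file only
// generates for signed-integer and float lane types. Broadcasting scalars
// through the bitwise ops makes byte-plane masking terse; sketch
// (hypothetical helper, not generated code):
//
//     fn low_nibbles<S: Simd>(v: u8x64<S>) -> u8x64<S> {
//         v & 0x0f_u8 // keep the low four bits of every lane
//     }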
impl<S: Simd> core::ops::BitAnd for mask8x64<S> { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask8x64(self, rhs) } }
impl<S: Simd> core::ops::BitAndAssign for mask8x64<S> { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask8x64(*self, rhs); } }
impl<S: Simd> core::ops::BitAnd<i8> for mask8x64<S> { type Output = Self; #[inline(always)] fn bitand(self, rhs: i8) -> Self::Output { self.simd.and_mask8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::BitAndAssign<i8> for mask8x64<S> { #[inline(always)] fn bitand_assign(&mut self, rhs: i8) { *self = self.simd.and_mask8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::BitAnd<mask8x64<S>> for i8 { type Output = mask8x64<S>; #[inline(always)] fn bitand(self, rhs: mask8x64<S>) -> Self::Output { rhs.simd.and_mask8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::BitOr for mask8x64<S> { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask8x64(self, rhs) } }
impl<S: Simd> core::ops::BitOrAssign for mask8x64<S> { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask8x64(*self, rhs); } }
impl<S: Simd> core::ops::BitOr<i8> for mask8x64<S> { type Output = Self; #[inline(always)] fn bitor(self, rhs: i8) -> Self::Output { self.simd.or_mask8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::BitOrAssign<i8> for mask8x64<S> { #[inline(always)] fn bitor_assign(&mut self, rhs: i8) { *self = self.simd.or_mask8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::BitOr<mask8x64<S>> for i8 { type Output = mask8x64<S>; #[inline(always)] fn bitor(self, rhs: mask8x64<S>) -> Self::Output { rhs.simd.or_mask8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::BitXor for mask8x64<S> { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask8x64(self, rhs) } }
impl<S: Simd> core::ops::BitXorAssign for mask8x64<S> { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask8x64(*self, rhs); } }
impl<S: Simd> core::ops::BitXor<i8> for mask8x64<S> { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i8) -> Self::Output { self.simd.xor_mask8x64(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> core::ops::BitXorAssign<i8> for mask8x64<S> { #[inline(always)] fn bitxor_assign(&mut self, rhs: i8) { *self = self.simd.xor_mask8x64(*self, rhs.simd_into(self.simd)); } }
impl<S: Simd> core::ops::BitXor<mask8x64<S>> for i8 { type Output = mask8x64<S>; #[inline(always)] fn bitxor(self, rhs: mask8x64<S>) -> Self::Output { rhs.simd.xor_mask8x64(self.simd_into(rhs.simd), rhs) } }
impl<S: Simd> core::ops::Not for mask8x64<S> { type Output = Self; #[inline(always)] fn not(self) -> Self::Output {
self.simd.not_mask8x64(self) } } impl core::ops::Neg for i16x32 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_i16x32(self) } } impl core::ops::Add for i16x32 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_i16x32(self, rhs) } } impl core::ops::AddAssign for i16x32 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_i16x32(*self, rhs); } } impl core::ops::Add for i16x32 { type Output = Self; #[inline(always)] fn add(self, rhs: i16) -> Self::Output { self.simd.add_i16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for i16x32 { #[inline(always)] fn add_assign(&mut self, rhs: i16) { *self = self.simd.add_i16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for i16 { type Output = i16x32; #[inline(always)] fn add(self, rhs: i16x32) -> Self::Output { rhs.simd.add_i16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for i16x32 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_i16x32(self, rhs) } } impl core::ops::SubAssign for i16x32 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_i16x32(*self, rhs); } } impl core::ops::Sub for i16x32 { type Output = Self; #[inline(always)] fn sub(self, rhs: i16) -> Self::Output { self.simd.sub_i16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for i16x32 { #[inline(always)] fn sub_assign(&mut self, rhs: i16) { *self = self.simd.sub_i16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for i16 { type Output = i16x32; #[inline(always)] fn sub(self, rhs: i16x32) -> Self::Output { rhs.simd.sub_i16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for i16x32 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_i16x32(self, rhs) } } impl core::ops::MulAssign for i16x32 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_i16x32(*self, rhs); } } impl core::ops::Mul for i16x32 { type Output = Self; #[inline(always)] fn mul(self, rhs: i16) -> Self::Output { self.simd.mul_i16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for i16x32 { #[inline(always)] fn mul_assign(&mut self, rhs: i16) { *self = self.simd.mul_i16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for i16 { type Output = i16x32; #[inline(always)] fn mul(self, rhs: i16x32) -> Self::Output { rhs.simd.mul_i16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for i16x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_i16x32(self, rhs) } } impl core::ops::BitAndAssign for i16x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_i16x32(*self, rhs); } } impl core::ops::BitAnd for i16x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i16) -> Self::Output { self.simd.and_i16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for i16x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: i16) { *self = self.simd.and_i16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i16 { type Output = i16x32; #[inline(always)] fn bitand(self, rhs: i16x32) -> Self::Output { rhs.simd.and_i16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for i16x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_i16x32(self, rhs) } } impl core::ops::BitOrAssign for i16x32 { 
#[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_i16x32(*self, rhs); } } impl core::ops::BitOr for i16x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i16) -> Self::Output { self.simd.or_i16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for i16x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: i16) { *self = self.simd.or_i16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i16 { type Output = i16x32; #[inline(always)] fn bitor(self, rhs: i16x32) -> Self::Output { rhs.simd.or_i16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for i16x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_i16x32(self, rhs) } } impl core::ops::BitXorAssign for i16x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_i16x32(*self, rhs); } } impl core::ops::BitXor for i16x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i16) -> Self::Output { self.simd.xor_i16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for i16x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i16) { *self = self.simd.xor_i16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i16 { type Output = i16x32; #[inline(always)] fn bitxor(self, rhs: i16x32) -> Self::Output { rhs.simd.xor_i16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for i16x32 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_i16x32(self, rhs) } } impl core::ops::Shr for i16x32 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_i16x32(self, rhs) } } impl core::ops::ShlAssign for i16x32 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_i16x32(*self, rhs); } } impl core::ops::ShrAssign for i16x32 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_i16x32(*self, rhs); } } impl core::ops::Shr for i16x32 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_i16x32(self, rhs) } } impl core::ops::ShrAssign for i16x32 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_i16x32(*self, rhs); } } impl core::ops::Add for u16x32 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_u16x32(self, rhs) } } impl core::ops::AddAssign for u16x32 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_u16x32(*self, rhs); } } impl core::ops::Add for u16x32 { type Output = Self; #[inline(always)] fn add(self, rhs: u16) -> Self::Output { self.simd.add_u16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for u16x32 { #[inline(always)] fn add_assign(&mut self, rhs: u16) { *self = self.simd.add_u16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for u16 { type Output = u16x32; #[inline(always)] fn add(self, rhs: u16x32) -> Self::Output { rhs.simd.add_u16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for u16x32 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_u16x32(self, rhs) } } impl core::ops::SubAssign for u16x32 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_u16x32(*self, rhs); } } impl core::ops::Sub for u16x32 { type Output = Self; #[inline(always)] fn sub(self, rhs: u16) -> Self::Output { self.simd.sub_u16x32(self, rhs.simd_into(self.simd)) } } impl 
core::ops::SubAssign for u16x32 { #[inline(always)] fn sub_assign(&mut self, rhs: u16) { *self = self.simd.sub_u16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for u16 { type Output = u16x32; #[inline(always)] fn sub(self, rhs: u16x32) -> Self::Output { rhs.simd.sub_u16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for u16x32 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_u16x32(self, rhs) } } impl core::ops::MulAssign for u16x32 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_u16x32(*self, rhs); } } impl core::ops::Mul for u16x32 { type Output = Self; #[inline(always)] fn mul(self, rhs: u16) -> Self::Output { self.simd.mul_u16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for u16x32 { #[inline(always)] fn mul_assign(&mut self, rhs: u16) { *self = self.simd.mul_u16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for u16 { type Output = u16x32; #[inline(always)] fn mul(self, rhs: u16x32) -> Self::Output { rhs.simd.mul_u16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for u16x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_u16x32(self, rhs) } } impl core::ops::BitAndAssign for u16x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_u16x32(*self, rhs); } } impl core::ops::BitAnd for u16x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: u16) -> Self::Output { self.simd.and_u16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for u16x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: u16) { *self = self.simd.and_u16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for u16 { type Output = u16x32; #[inline(always)] fn bitand(self, rhs: u16x32) -> Self::Output { rhs.simd.and_u16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for u16x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_u16x32(self, rhs) } } impl core::ops::BitOrAssign for u16x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_u16x32(*self, rhs); } } impl core::ops::BitOr for u16x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: u16) -> Self::Output { self.simd.or_u16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for u16x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: u16) { *self = self.simd.or_u16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for u16 { type Output = u16x32; #[inline(always)] fn bitor(self, rhs: u16x32) -> Self::Output { rhs.simd.or_u16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for u16x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_u16x32(self, rhs) } } impl core::ops::BitXorAssign for u16x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_u16x32(*self, rhs); } } impl core::ops::BitXor for u16x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: u16) -> Self::Output { self.simd.xor_u16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for u16x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: u16) { *self = self.simd.xor_u16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for u16 { type Output = u16x32; #[inline(always)] fn bitxor(self, rhs: u16x32) -> Self::Output { rhs.simd.xor_u16x32(self.simd_into(rhs.simd), 
rhs) } } impl core::ops::Shl for u16x32 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_u16x32(self, rhs) } } impl core::ops::Shr for u16x32 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_u16x32(self, rhs) } } impl core::ops::ShlAssign for u16x32 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_u16x32(*self, rhs); } } impl core::ops::ShrAssign for u16x32 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_u16x32(*self, rhs); } } impl core::ops::Shr for u16x32 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_u16x32(self, rhs) } } impl core::ops::ShrAssign for u16x32 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_u16x32(*self, rhs); } } impl core::ops::BitAnd for mask16x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask16x32(self, rhs) } } impl core::ops::BitAndAssign for mask16x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask16x32(*self, rhs); } } impl core::ops::BitAnd for mask16x32 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i16) -> Self::Output { self.simd.and_mask16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask16x32 { #[inline(always)] fn bitand_assign(&mut self, rhs: i16) { *self = self.simd.and_mask16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i16 { type Output = mask16x32; #[inline(always)] fn bitand(self, rhs: mask16x32) -> Self::Output { rhs.simd.and_mask16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask16x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask16x32(self, rhs) } } impl core::ops::BitOrAssign for mask16x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask16x32(*self, rhs); } } impl core::ops::BitOr for mask16x32 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i16) -> Self::Output { self.simd.or_mask16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask16x32 { #[inline(always)] fn bitor_assign(&mut self, rhs: i16) { *self = self.simd.or_mask16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i16 { type Output = mask16x32; #[inline(always)] fn bitor(self, rhs: mask16x32) -> Self::Output { rhs.simd.or_mask16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask16x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask16x32(self, rhs) } } impl core::ops::BitXorAssign for mask16x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask16x32(*self, rhs); } } impl core::ops::BitXor for mask16x32 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i16) -> Self::Output { self.simd.xor_mask16x32(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask16x32 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i16) { *self = self.simd.xor_mask16x32(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i16 { type Output = mask16x32; #[inline(always)] fn bitxor(self, rhs: mask16x32) -> Self::Output { rhs.simd.xor_mask16x32(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask16x32 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { 
self.simd.not_mask16x32(self) } } impl core::ops::Neg for i32x16 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_i32x16(self) } } impl core::ops::Add for i32x16 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_i32x16(self, rhs) } } impl core::ops::AddAssign for i32x16 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_i32x16(*self, rhs); } } impl core::ops::Add for i32x16 { type Output = Self; #[inline(always)] fn add(self, rhs: i32) -> Self::Output { self.simd.add_i32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for i32x16 { #[inline(always)] fn add_assign(&mut self, rhs: i32) { *self = self.simd.add_i32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for i32 { type Output = i32x16; #[inline(always)] fn add(self, rhs: i32x16) -> Self::Output { rhs.simd.add_i32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for i32x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_i32x16(self, rhs) } } impl core::ops::SubAssign for i32x16 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_i32x16(*self, rhs); } } impl core::ops::Sub for i32x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: i32) -> Self::Output { self.simd.sub_i32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for i32x16 { #[inline(always)] fn sub_assign(&mut self, rhs: i32) { *self = self.simd.sub_i32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for i32 { type Output = i32x16; #[inline(always)] fn sub(self, rhs: i32x16) -> Self::Output { rhs.simd.sub_i32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for i32x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_i32x16(self, rhs) } } impl core::ops::MulAssign for i32x16 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_i32x16(*self, rhs); } } impl core::ops::Mul for i32x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: i32) -> Self::Output { self.simd.mul_i32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for i32x16 { #[inline(always)] fn mul_assign(&mut self, rhs: i32) { *self = self.simd.mul_i32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for i32 { type Output = i32x16; #[inline(always)] fn mul(self, rhs: i32x16) -> Self::Output { rhs.simd.mul_i32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for i32x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_i32x16(self, rhs) } } impl core::ops::BitAndAssign for i32x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_i32x16(*self, rhs); } } impl core::ops::BitAnd for i32x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i32) -> Self::Output { self.simd.and_i32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for i32x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: i32) { *self = self.simd.and_i32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i32 { type Output = i32x16; #[inline(always)] fn bitand(self, rhs: i32x16) -> Self::Output { rhs.simd.and_i32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for i32x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_i32x16(self, rhs) } } impl core::ops::BitOrAssign for i32x16 { 
#[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_i32x16(*self, rhs); } } impl core::ops::BitOr for i32x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i32) -> Self::Output { self.simd.or_i32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for i32x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: i32) { *self = self.simd.or_i32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i32 { type Output = i32x16; #[inline(always)] fn bitor(self, rhs: i32x16) -> Self::Output { rhs.simd.or_i32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for i32x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_i32x16(self, rhs) } } impl core::ops::BitXorAssign for i32x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_i32x16(*self, rhs); } } impl core::ops::BitXor for i32x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i32) -> Self::Output { self.simd.xor_i32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for i32x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i32) { *self = self.simd.xor_i32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i32 { type Output = i32x16; #[inline(always)] fn bitxor(self, rhs: i32x16) -> Self::Output { rhs.simd.xor_i32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Shl for i32x16 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_i32x16(self, rhs) } } impl core::ops::Shr for i32x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_i32x16(self, rhs) } } impl core::ops::ShlAssign for i32x16 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_i32x16(*self, rhs); } } impl core::ops::ShrAssign for i32x16 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_i32x16(*self, rhs); } } impl core::ops::Shr for i32x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_i32x16(self, rhs) } } impl core::ops::ShrAssign for i32x16 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_i32x16(*self, rhs); } } impl core::ops::Add for u32x16 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_u32x16(self, rhs) } } impl core::ops::AddAssign for u32x16 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_u32x16(*self, rhs); } } impl core::ops::Add for u32x16 { type Output = Self; #[inline(always)] fn add(self, rhs: u32) -> Self::Output { self.simd.add_u32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for u32x16 { #[inline(always)] fn add_assign(&mut self, rhs: u32) { *self = self.simd.add_u32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for u32 { type Output = u32x16; #[inline(always)] fn add(self, rhs: u32x16) -> Self::Output { rhs.simd.add_u32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for u32x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_u32x16(self, rhs) } } impl core::ops::SubAssign for u32x16 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_u32x16(*self, rhs); } } impl core::ops::Sub for u32x16 { type Output = Self; #[inline(always)] fn sub(self, rhs: u32) -> Self::Output { self.simd.sub_u32x16(self, rhs.simd_into(self.simd)) } } impl 
core::ops::SubAssign for u32x16 { #[inline(always)] fn sub_assign(&mut self, rhs: u32) { *self = self.simd.sub_u32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for u32 { type Output = u32x16; #[inline(always)] fn sub(self, rhs: u32x16) -> Self::Output { rhs.simd.sub_u32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for u32x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_u32x16(self, rhs) } } impl core::ops::MulAssign for u32x16 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_u32x16(*self, rhs); } } impl core::ops::Mul for u32x16 { type Output = Self; #[inline(always)] fn mul(self, rhs: u32) -> Self::Output { self.simd.mul_u32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for u32x16 { #[inline(always)] fn mul_assign(&mut self, rhs: u32) { *self = self.simd.mul_u32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for u32 { type Output = u32x16; #[inline(always)] fn mul(self, rhs: u32x16) -> Self::Output { rhs.simd.mul_u32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for u32x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_u32x16(self, rhs) } } impl core::ops::BitAndAssign for u32x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_u32x16(*self, rhs); } } impl core::ops::BitAnd for u32x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: u32) -> Self::Output { self.simd.and_u32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for u32x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: u32) { *self = self.simd.and_u32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for u32 { type Output = u32x16; #[inline(always)] fn bitand(self, rhs: u32x16) -> Self::Output { rhs.simd.and_u32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for u32x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_u32x16(self, rhs) } } impl core::ops::BitOrAssign for u32x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_u32x16(*self, rhs); } } impl core::ops::BitOr for u32x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: u32) -> Self::Output { self.simd.or_u32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for u32x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: u32) { *self = self.simd.or_u32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for u32 { type Output = u32x16; #[inline(always)] fn bitor(self, rhs: u32x16) -> Self::Output { rhs.simd.or_u32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for u32x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_u32x16(self, rhs) } } impl core::ops::BitXorAssign for u32x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_u32x16(*self, rhs); } } impl core::ops::BitXor for u32x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: u32) -> Self::Output { self.simd.xor_u32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for u32x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: u32) { *self = self.simd.xor_u32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for u32 { type Output = u32x16; #[inline(always)] fn bitxor(self, rhs: u32x16) -> Self::Output { rhs.simd.xor_u32x16(self.simd_into(rhs.simd), 
rhs) } } impl core::ops::Shl for u32x16 { type Output = Self; #[inline(always)] fn shl(self, rhs: u32) -> Self::Output { self.simd.shl_u32x16(self, rhs) } } impl core::ops::Shr for u32x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: u32) -> Self::Output { self.simd.shr_u32x16(self, rhs) } } impl core::ops::ShlAssign for u32x16 { #[inline(always)] fn shl_assign(&mut self, rhs: u32) { *self = self.simd.shl_u32x16(*self, rhs); } } impl core::ops::ShrAssign for u32x16 { #[inline(always)] fn shr_assign(&mut self, rhs: u32) { *self = self.simd.shr_u32x16(*self, rhs); } } impl core::ops::Shr for u32x16 { type Output = Self; #[inline(always)] fn shr(self, rhs: Self) -> Self::Output { self.simd.shrv_u32x16(self, rhs) } } impl core::ops::ShrAssign for u32x16 { #[inline(always)] fn shr_assign(&mut self, rhs: Self) { *self = self.simd.shrv_u32x16(*self, rhs); } } impl core::ops::BitAnd for mask32x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask32x16(self, rhs) } } impl core::ops::BitAndAssign for mask32x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = self.simd.and_mask32x16(*self, rhs); } } impl core::ops::BitAnd for mask32x16 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i32) -> Self::Output { self.simd.and_mask32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask32x16 { #[inline(always)] fn bitand_assign(&mut self, rhs: i32) { *self = self.simd.and_mask32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i32 { type Output = mask32x16; #[inline(always)] fn bitand(self, rhs: mask32x16) -> Self::Output { rhs.simd.and_mask32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask32x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask32x16(self, rhs) } } impl core::ops::BitOrAssign for mask32x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask32x16(*self, rhs); } } impl core::ops::BitOr for mask32x16 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i32) -> Self::Output { self.simd.or_mask32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask32x16 { #[inline(always)] fn bitor_assign(&mut self, rhs: i32) { *self = self.simd.or_mask32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i32 { type Output = mask32x16; #[inline(always)] fn bitor(self, rhs: mask32x16) -> Self::Output { rhs.simd.or_mask32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask32x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask32x16(self, rhs) } } impl core::ops::BitXorAssign for mask32x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask32x16(*self, rhs); } } impl core::ops::BitXor for mask32x16 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i32) -> Self::Output { self.simd.xor_mask32x16(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask32x16 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i32) { *self = self.simd.xor_mask32x16(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i32 { type Output = mask32x16; #[inline(always)] fn bitxor(self, rhs: mask32x16) -> Self::Output { rhs.simd.xor_mask32x16(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask32x16 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { 
self.simd.not_mask32x16(self) } } impl core::ops::Neg for f64x8 { type Output = Self; #[inline(always)] fn neg(self) -> Self::Output { self.simd.neg_f64x8(self) } } impl core::ops::Add for f64x8 { type Output = Self; #[inline(always)] fn add(self, rhs: Self) -> Self::Output { self.simd.add_f64x8(self, rhs) } } impl core::ops::AddAssign for f64x8 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = self.simd.add_f64x8(*self, rhs); } } impl core::ops::Add for f64x8 { type Output = Self; #[inline(always)] fn add(self, rhs: f64) -> Self::Output { self.simd.add_f64x8(self, rhs.simd_into(self.simd)) } } impl core::ops::AddAssign for f64x8 { #[inline(always)] fn add_assign(&mut self, rhs: f64) { *self = self.simd.add_f64x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Add> for f64 { type Output = f64x8; #[inline(always)] fn add(self, rhs: f64x8) -> Self::Output { rhs.simd.add_f64x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Sub for f64x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: Self) -> Self::Output { self.simd.sub_f64x8(self, rhs) } } impl core::ops::SubAssign for f64x8 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = self.simd.sub_f64x8(*self, rhs); } } impl core::ops::Sub for f64x8 { type Output = Self; #[inline(always)] fn sub(self, rhs: f64) -> Self::Output { self.simd.sub_f64x8(self, rhs.simd_into(self.simd)) } } impl core::ops::SubAssign for f64x8 { #[inline(always)] fn sub_assign(&mut self, rhs: f64) { *self = self.simd.sub_f64x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Sub> for f64 { type Output = f64x8; #[inline(always)] fn sub(self, rhs: f64x8) -> Self::Output { rhs.simd.sub_f64x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Mul for f64x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: Self) -> Self::Output { self.simd.mul_f64x8(self, rhs) } } impl core::ops::MulAssign for f64x8 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = self.simd.mul_f64x8(*self, rhs); } } impl core::ops::Mul for f64x8 { type Output = Self; #[inline(always)] fn mul(self, rhs: f64) -> Self::Output { self.simd.mul_f64x8(self, rhs.simd_into(self.simd)) } } impl core::ops::MulAssign for f64x8 { #[inline(always)] fn mul_assign(&mut self, rhs: f64) { *self = self.simd.mul_f64x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Mul> for f64 { type Output = f64x8; #[inline(always)] fn mul(self, rhs: f64x8) -> Self::Output { rhs.simd.mul_f64x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Div for f64x8 { type Output = Self; #[inline(always)] fn div(self, rhs: Self) -> Self::Output { self.simd.div_f64x8(self, rhs) } } impl core::ops::DivAssign for f64x8 { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = self.simd.div_f64x8(*self, rhs); } } impl core::ops::Div for f64x8 { type Output = Self; #[inline(always)] fn div(self, rhs: f64) -> Self::Output { self.simd.div_f64x8(self, rhs.simd_into(self.simd)) } } impl core::ops::DivAssign for f64x8 { #[inline(always)] fn div_assign(&mut self, rhs: f64) { *self = self.simd.div_f64x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::Div> for f64 { type Output = f64x8; #[inline(always)] fn div(self, rhs: f64x8) -> Self::Output { rhs.simd.div_f64x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitAnd for mask64x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: Self) -> Self::Output { self.simd.and_mask64x8(self, rhs) } } impl core::ops::BitAndAssign for mask64x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = 
self.simd.and_mask64x8(*self, rhs); } } impl core::ops::BitAnd for mask64x8 { type Output = Self; #[inline(always)] fn bitand(self, rhs: i64) -> Self::Output { self.simd.and_mask64x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitAndAssign for mask64x8 { #[inline(always)] fn bitand_assign(&mut self, rhs: i64) { *self = self.simd.and_mask64x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitAnd> for i64 { type Output = mask64x8; #[inline(always)] fn bitand(self, rhs: mask64x8) -> Self::Output { rhs.simd.and_mask64x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitOr for mask64x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: Self) -> Self::Output { self.simd.or_mask64x8(self, rhs) } } impl core::ops::BitOrAssign for mask64x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = self.simd.or_mask64x8(*self, rhs); } } impl core::ops::BitOr for mask64x8 { type Output = Self; #[inline(always)] fn bitor(self, rhs: i64) -> Self::Output { self.simd.or_mask64x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitOrAssign for mask64x8 { #[inline(always)] fn bitor_assign(&mut self, rhs: i64) { *self = self.simd.or_mask64x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitOr> for i64 { type Output = mask64x8; #[inline(always)] fn bitor(self, rhs: mask64x8) -> Self::Output { rhs.simd.or_mask64x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::BitXor for mask64x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: Self) -> Self::Output { self.simd.xor_mask64x8(self, rhs) } } impl core::ops::BitXorAssign for mask64x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = self.simd.xor_mask64x8(*self, rhs); } } impl core::ops::BitXor for mask64x8 { type Output = Self; #[inline(always)] fn bitxor(self, rhs: i64) -> Self::Output { self.simd.xor_mask64x8(self, rhs.simd_into(self.simd)) } } impl core::ops::BitXorAssign for mask64x8 { #[inline(always)] fn bitxor_assign(&mut self, rhs: i64) { *self = self.simd.xor_mask64x8(*self, rhs.simd_into(self.simd)); } } impl core::ops::BitXor> for i64 { type Output = mask64x8; #[inline(always)] fn bitxor(self, rhs: mask64x8) -> Self::Output { rhs.simd.xor_mask64x8(self.simd_into(rhs.simd), rhs) } } impl core::ops::Not for mask64x8 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { self.simd.not_mask64x8(self) } } fearless_simd-0.3.0/src/generated/simd_trait.rs000064400000000000000000002031231046102023000176770ustar 00000000000000// Copyright 2025 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT // This file is autogenerated by fearless_simd_gen use crate::{ Bytes, Level, Select, SimdCvtFloat, SimdCvtTruncate, SimdElement, SimdFrom, SimdInto, seal::Seal, }; use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, }; #[doc = r" TODO: docstring"] pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static { type f32s: SimdFloat< f32, Self, Block = f32x4, Mask = Self::mask32s, Bytes = ::Bytes, > + SimdCvtFloat + SimdCvtFloat; type u8s: SimdInt, Mask = Self::mask8s>; type i8s: SimdInt< i8, Self, Block = i8x16, Mask = Self::mask8s, Bytes = ::Bytes, > + core::ops::Neg; type u16s: SimdInt, Mask = Self::mask16s>; type i16s: SimdInt< i16, Self, Block = i16x8, Mask = Self::mask16s, Bytes = ::Bytes, 
> + core::ops::Neg; type u32s: SimdInt, Mask = Self::mask32s> + SimdCvtTruncate; type i32s: SimdInt< i32, Self, Block = i32x4, Mask = Self::mask32s, Bytes = ::Bytes, > + SimdCvtTruncate + core::ops::Neg; type mask8s: SimdMask, Bytes = ::Bytes> + Select + Select + Select; type mask16s: SimdMask, Bytes = ::Bytes> + Select + Select + Select; type mask32s: SimdMask, Bytes = ::Bytes> + Select + Select + Select + Select; fn level(self) -> Level; #[doc = r" Call function with CPU features enabled."] #[doc = r""] #[doc = r" For performance, the provided function should be `#[inline(always)]`."] fn vectorize R, R>(self, f: F) -> R; fn splat_f32x4(self, val: f32) -> f32x4; fn abs_f32x4(self, a: f32x4) -> f32x4; fn neg_f32x4(self, a: f32x4) -> f32x4; fn sqrt_f32x4(self, a: f32x4) -> f32x4; fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn sub_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn mul_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn div_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn copysign_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn simd_eq_f32x4(self, a: f32x4, b: f32x4) -> mask32x4; fn simd_lt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4; fn simd_le_f32x4(self, a: f32x4, b: f32x4) -> mask32x4; fn simd_ge_f32x4(self, a: f32x4, b: f32x4) -> mask32x4; fn simd_gt_f32x4(self, a: f32x4, b: f32x4) -> mask32x4; fn zip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn zip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4; fn madd_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4; fn msub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4; fn floor_f32x4(self, a: f32x4) -> f32x4; fn fract_f32x4(self, a: f32x4) -> f32x4; fn trunc_f32x4(self, a: f32x4) -> f32x4; fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4; fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8; fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2; fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4; fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16; fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4; fn cvt_u32_f32x4(self, a: f32x4) -> u32x4; fn cvt_i32_f32x4(self, a: f32x4) -> i32x4; fn splat_i8x16(self, val: i8) -> i8x16; fn not_i8x16(self, a: i8x16) -> i8x16; fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16; fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16; fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16; fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16; fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16; fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16; fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16; fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16; fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16; fn max_i8x16(self, 
a: i8x16, b: i8x16) -> i8x16; fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32; fn neg_i8x16(self, a: i8x16) -> i8x16; fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16; fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4; fn splat_u8x16(self, val: u8) -> u8x16; fn not_u8x16(self, a: u8x16) -> u8x16; fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16; fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16; fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16; fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16; fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16; fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16; fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16; fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16; fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16; fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32; fn widen_u8x16(self, a: u8x16) -> u16x16; fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4; fn splat_mask8x16(self, val: i8) -> mask8x16; fn not_mask8x16(self, a: mask8x16) -> mask8x16; fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16; fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16; fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16; fn select_mask8x16( self, a: mask8x16, b: mask8x16, c: mask8x16, ) -> mask8x16; fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16; fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32; fn splat_i16x8(self, val: i16) -> i16x8; fn not_i16x8(self, a: i16x8) -> i16x8; fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8; fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8; fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8; fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8; fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8; fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8; fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8; fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8; fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8; fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16; fn neg_i16x8(self, a: i16x8) -> i16x8; fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16; fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4; fn splat_u16x8(self, val: u16) -> u16x8; fn not_u16x8(self, a: u16x8) -> u16x8; fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn 
sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8; fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8; fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8; fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8; fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8; fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8; fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8; fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8; fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8; fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16; fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16; fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4; fn splat_mask16x8(self, val: i16) -> mask16x8; fn not_mask16x8(self, a: mask16x8) -> mask16x8; fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8; fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8; fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8; fn select_mask16x8( self, a: mask16x8, b: mask16x8, c: mask16x8, ) -> mask16x8; fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8; fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16; fn splat_i32x4(self, val: i32) -> i32x4; fn not_i32x4(self, a: i32x4) -> i32x4; fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4; fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4; fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4; fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4; fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4; fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4; fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4; fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4; fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4; fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8; fn neg_i32x4(self, a: i32x4) -> i32x4; fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16; fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4; fn cvt_f32_i32x4(self, a: i32x4) -> f32x4; fn splat_u32x4(self, val: u32) -> u32x4; fn not_u32x4(self, a: u32x4) -> u32x4; fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4; fn 
shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4; fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4; fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4; fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4; fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4; fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4; fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4; fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4; fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8; fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16; fn cvt_f32_u32x4(self, a: u32x4) -> f32x4; fn splat_mask32x4(self, val: i32) -> mask32x4; fn not_mask32x4(self, a: mask32x4) -> mask32x4; fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4; fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4; fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4; fn select_mask32x4( self, a: mask32x4, b: mask32x4, c: mask32x4, ) -> mask32x4; fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4; fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8; fn splat_f64x2(self, val: f64) -> f64x2; fn abs_f64x2(self, a: f64x2) -> f64x2; fn neg_f64x2(self, a: f64x2) -> f64x2; fn sqrt_f64x2(self, a: f64x2) -> f64x2; fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2; fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2; fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2; fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2; fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2; fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2; fn madd_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2; fn msub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2; fn floor_f64x2(self, a: f64x2) -> f64x2; fn fract_f64x2(self, a: f64x2) -> f64x2; fn trunc_f64x2(self, a: f64x2) -> f64x2; fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2; fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4; fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4; fn splat_mask64x2(self, val: i64) -> mask64x2; fn not_mask64x2(self, a: mask64x2) -> mask64x2; fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2; fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2; fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2; fn select_mask64x2( self, a: mask64x2, b: mask64x2, c: mask64x2, ) -> mask64x2; fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2; fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4; fn splat_f32x8(self, val: f32) -> f32x8; fn abs_f32x8(self, a: f32x8) -> f32x8; fn neg_f32x8(self, a: f32x8) -> f32x8; fn sqrt_f32x8(self, a: 
f32x8) -> f32x8; fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8; fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8; fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8; fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8; fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8; fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8; fn madd_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8; fn msub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8; fn floor_f32x8(self, a: f32x8) -> f32x8; fn fract_f32x8(self, a: f32x8) -> f32x8; fn trunc_f32x8(self, a: f32x8) -> f32x8; fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8; fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16; fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4); fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4; fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8; fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32; fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8; fn cvt_u32_f32x8(self, a: f32x8) -> u32x8; fn cvt_i32_f32x8(self, a: f32x8) -> i32x8; fn splat_i8x32(self, val: i8) -> i8x32; fn not_i8x32(self, a: i8x32) -> i8x32; fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn shr_i8x32(self, a: i8x32, shift: u32) -> i8x32; fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn shl_i8x32(self, a: i8x32, shift: u32) -> i8x32; fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32; fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32; fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32; fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32; fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32; fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32; fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32; fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64; fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16); fn neg_i8x32(self, a: i8x32) -> i8x32; fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32; fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8; fn splat_u8x32(self, val: u8) -> u8x32; fn not_u8x32(self, a: u8x32) -> u8x32; fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn shr_u8x32(self, a: u8x32, shift: u32) -> u8x32; fn shrv_u8x32(self, a: 
u8x32, b: u8x32) -> u8x32; fn shl_u8x32(self, a: u8x32, shift: u32) -> u8x32; fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32; fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32; fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32; fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32; fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32; fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32; fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32; fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64; fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16); fn widen_u8x32(self, a: u8x32) -> u16x32; fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8; fn splat_mask8x32(self, val: i8) -> mask8x32; fn not_mask8x32(self, a: mask8x32) -> mask8x32; fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32; fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32; fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32; fn select_mask8x32( self, a: mask8x32, b: mask8x32, c: mask8x32, ) -> mask8x32; fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32; fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64; fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16); fn splat_i16x16(self, val: i16) -> i16x16; fn not_i16x16(self, a: i16x16) -> i16x16; fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn and_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn shr_i16x16(self, a: i16x16, shift: u32) -> i16x16; fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn shl_i16x16(self, a: i16x16, shift: u32) -> i16x16; fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16; fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16; fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16; fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16; fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16; fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16; fn min_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16; fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32; fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8); fn neg_i16x16(self, a: i16x16) -> i16x16; fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32; fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8; fn splat_u16x16(self, val: u16) -> u16x16; fn not_u16x16(self, a: u16x16) -> u16x16; fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn shr_u16x16(self, a: u16x16, shift: u32) -> u16x16; fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn shl_u16x16(self, a: u16x16, 
shift: u32) -> u16x16; fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16; fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16; fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16; fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16; fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16; fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16; fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16; fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32; fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8); fn narrow_u16x16(self, a: u16x16) -> u8x16; fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32; fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8; fn splat_mask16x16(self, val: i16) -> mask16x16; fn not_mask16x16(self, a: mask16x16) -> mask16x16; fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16; fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16; fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16; fn select_mask16x16( self, a: mask16x16, b: mask16x16, c: mask16x16, ) -> mask16x16; fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16; fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32; fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8); fn splat_i32x8(self, val: i32) -> i32x8; fn not_i32x8(self, a: i32x8) -> i32x8; fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn shr_i32x8(self, a: i32x8, shift: u32) -> i32x8; fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn shl_i32x8(self, a: i32x8, shift: u32) -> i32x8; fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8; fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8; fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8; fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8; fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8; fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8; fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8; fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16; fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4); fn neg_i32x8(self, a: i32x8) -> i32x8; fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32; fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8; fn cvt_f32_i32x8(self, a: i32x8) -> f32x8; fn splat_u32x8(self, val: u32) -> u32x8; fn not_u32x8(self, a: u32x8) -> u32x8; fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn shr_u32x8(self, a: u32x8, shift: u32) -> u32x8; fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn shl_u32x8(self, a: u32x8, shift: 
u32) -> u32x8; fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8; fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8; fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8; fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8; fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8; fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8; fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8; fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16; fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4); fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32; fn cvt_f32_u32x8(self, a: u32x8) -> f32x8; fn splat_mask32x8(self, val: i32) -> mask32x8; fn not_mask32x8(self, a: mask32x8) -> mask32x8; fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8; fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8; fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8; fn select_mask32x8( self, a: mask32x8, b: mask32x8, c: mask32x8, ) -> mask32x8; fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8; fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16; fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4); fn splat_f64x4(self, val: f64) -> f64x4; fn abs_f64x4(self, a: f64x4) -> f64x4; fn neg_f64x4(self, a: f64x4) -> f64x4; fn sqrt_f64x4(self, a: f64x4) -> f64x4; fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4; fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4; fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4; fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4; fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4; fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4; fn madd_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4; fn msub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4; fn floor_f64x4(self, a: f64x4) -> f64x4; fn fract_f64x4(self, a: f64x4) -> f64x4; fn trunc_f64x4(self, a: f64x4) -> f64x4; fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4; fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8; fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2); fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8; fn splat_mask64x4(self, val: i64) -> mask64x4; fn not_mask64x4(self, a: mask64x4) -> mask64x4; fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4; fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4; fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4; fn select_mask64x4( self, a: mask64x4, b: mask64x4, c: mask64x4, ) -> mask64x4; fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4; fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8; fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2); 
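    // A note on the naming scheme (a reading aid, not generated text): every
    // operation in this trait is `{op}_{vector}` and takes `self` as the
    // capability token, e.g. `simd.add_u8x16(a, b)`. Vector widths double via
    // `combine_*` (two u8x16 -> u8x32) and halve via `split_*` (u8x32 -> two
    // u8x16), so, hypothetically, four u8x16 values can be assembled into one
    // 512-bit u8x64:
    //     let wide = simd.combine_u8x32(simd.combine_u8x16(a, b), simd.combine_u8x16(c, d));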
fn splat_f32x16(self, val: f32) -> f32x16; fn abs_f32x16(self, a: f32x16) -> f32x16; fn neg_f32x16(self, a: f32x16) -> f32x16; fn sqrt_f32x16(self, a: f32x16) -> f32x16; fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16; fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16; fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16; fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16; fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16; fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16; fn madd_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16; fn msub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16; fn floor_f32x16(self, a: f32x16) -> f32x16; fn fract_f32x16(self, a: f32x16) -> f32x16; fn trunc_f32x16(self, a: f32x16) -> f32x16; fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16; fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8); fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8; fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16; fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16; fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> (); fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64; fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16; fn cvt_u32_f32x16(self, a: f32x16) -> u32x16; fn cvt_i32_f32x16(self, a: f32x16) -> i32x16; fn splat_i8x64(self, val: i8) -> i8x64; fn not_i8x64(self, a: i8x64) -> i8x64; fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn shr_i8x64(self, a: i8x64, shift: u32) -> i8x64; fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn shl_i8x64(self, a: i8x64, shift: u32) -> i8x64; fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64; fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64; fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64; fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64; fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64; fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64; fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64; fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32); fn neg_i8x64(self, a: i8x64) -> i8x64; fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64; fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16; fn splat_u8x64(self, val: u8) -> u8x64; fn not_u8x64(self, a: u8x64) -> u8x64; fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn 
sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn shr_u8x64(self, a: u8x64, shift: u32) -> u8x64; fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn shl_u8x64(self, a: u8x64, shift: u32) -> u8x64; fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64; fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64; fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64; fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64; fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64; fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64; fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64; fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32); fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64; fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> (); fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16; fn splat_mask8x64(self, val: i8) -> mask8x64; fn not_mask8x64(self, a: mask8x64) -> mask8x64; fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64; fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64; fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64; fn select_mask8x64( self, a: mask8x64, b: mask8x64, c: mask8x64, ) -> mask8x64; fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64; fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32); fn splat_i16x32(self, val: i16) -> i16x32; fn not_i16x32(self, a: i16x32) -> i16x32; fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn shr_i16x32(self, a: i16x32, shift: u32) -> i16x32; fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn shl_i16x32(self, a: i16x32, shift: u32) -> i16x32; fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32; fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32; fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32; fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32; fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32; fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32; fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32; fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16); fn neg_i16x32(self, a: i16x32) -> i16x32; fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64; fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16; fn splat_u16x32(self, val: u16) -> u16x32; fn not_u16x32(self, a: u16x32) -> u16x32; fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn and_u16x32(self, a: u16x32, b: u16x32) -> 
u16x32; fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn shr_u16x32(self, a: u16x32, shift: u32) -> u16x32; fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn shl_u16x32(self, a: u16x32, shift: u32) -> u16x32; fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32; fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32; fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32; fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32; fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32; fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32; fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32; fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16); fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32; fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> (); fn narrow_u16x32(self, a: u16x32) -> u8x32; fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64; fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16; fn splat_mask16x32(self, val: i16) -> mask16x32; fn not_mask16x32(self, a: mask16x32) -> mask16x32; fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32; fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32; fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32; fn select_mask16x32( self, a: mask16x32, b: mask16x32, c: mask16x32, ) -> mask16x32; fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32; fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16); fn splat_i32x16(self, val: i32) -> i32x16; fn not_i32x16(self, a: i32x16) -> i32x16; fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn shr_i32x16(self, a: i32x16, shift: u32) -> i32x16; fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn shl_i32x16(self, a: i32x16, shift: u32) -> i32x16; fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16; fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16; fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16; fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16; fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16; fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16; fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16; fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8); fn neg_i32x16(self, a: i32x16) -> i32x16; fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64; fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16; fn cvt_f32_i32x16(self, a: i32x16) -> f32x16; fn splat_u32x16(self, val: u32) -> u32x16; fn not_u32x16(self, a: u32x16) -> u32x16; fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn sub_u32x16(self, a: u32x16, b: 
u32x16) -> u32x16; fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn shr_u32x16(self, a: u32x16, shift: u32) -> u32x16; fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn shl_u32x16(self, a: u32x16, shift: u32) -> u32x16; fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16; fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16; fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16; fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16; fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16; fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16; fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16; fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8); fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16; fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> (); fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64; fn cvt_f32_u32x16(self, a: u32x16) -> f32x16; fn splat_mask32x16(self, val: i32) -> mask32x16; fn not_mask32x16(self, a: mask32x16) -> mask32x16; fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16; fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16; fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16; fn select_mask32x16( self, a: mask32x16, b: mask32x16, c: mask32x16, ) -> mask32x16; fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16; fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8); fn splat_f64x8(self, val: f64) -> f64x8; fn abs_f64x8(self, a: f64x8) -> f64x8; fn neg_f64x8(self, a: f64x8) -> f64x8; fn sqrt_f64x8(self, a: f64x8) -> f64x8; fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8; fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8; fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8; fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8; fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8; fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8; fn madd_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8; fn msub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8; fn floor_f64x8(self, a: f64x8) -> f64x8; fn fract_f64x8(self, a: f64x8) -> f64x8; fn trunc_f64x8(self, a: f64x8) -> f64x8; fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8; fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4); fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16; fn splat_mask64x8(self, val: i64) -> mask64x8; fn not_mask64x8(self, a: mask64x8) -> mask64x8; fn 
and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8;
    fn or_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8;
    fn xor_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8;
    fn select_mask64x8(
        self,
        a: mask64x8,
        b: mask64x8,
        c: mask64x8,
    ) -> mask64x8;
    fn simd_eq_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8;
    fn split_mask64x8(self, a: mask64x8) -> (mask64x4, mask64x4);
}

pub trait SimdBase: Copy + Sync + Send + 'static + crate::Bytes + SimdFrom {
    const N: usize;
    #[doc = r" A SIMD vector mask with the same number of elements."]
    #[doc = r""]
    #[doc = r" The mask element is represented as an integer which is"]
    #[doc = r" all-0 for `false` and all-1 for `true`. When we get deep"]
    #[doc = r" into AVX-512, we need to think about predication masks."]
    #[doc = r""]
    #[doc = r" One possibility to consider is that the SIMD trait grows"]
    #[doc = r" `maskAxB` associated types."]
    type Mask: SimdMask;
    #[doc = r" A 128 bit SIMD vector of the same scalar type."]
    type Block: SimdBase;
    #[doc = r" Get the [`Simd`] implementation associated with this type."]
    fn witness(&self) -> S;
    fn as_slice(&self) -> &[Element];
    fn as_mut_slice(&mut self) -> &mut [Element];
    #[doc = r" Create a SIMD vector from a slice."]
    #[doc = r""]
    #[doc = r" The slice must be the proper width."]
    fn from_slice(simd: S, slice: &[Element]) -> Self;
    fn splat(simd: S, val: Element) -> Self;
    fn block_splat(block: Self::Block) -> Self;
}

pub trait SimdFloat:
    SimdBase
    + core::ops::Neg
    + core::ops::Add
    + core::ops::AddAssign
    + core::ops::Add
    + core::ops::AddAssign
    + core::ops::Sub
    + core::ops::SubAssign
    + core::ops::Sub
    + core::ops::SubAssign
    + core::ops::Mul
    + core::ops::MulAssign
    + core::ops::Mul
    + core::ops::MulAssign
    + core::ops::Div
    + core::ops::DivAssign
    + core::ops::Div
    + core::ops::DivAssign
{
    #[inline(always)]
    fn to_int<T: SimdCvtTruncate<Self>>(self) -> T {
        T::truncate_from(self)
    }
    fn abs(self) -> Self;
    fn sqrt(self) -> Self;
    fn copysign(self, rhs: impl SimdInto) -> Self;
    fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask;
    fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask;
    fn simd_le(self, rhs: impl SimdInto) -> Self::Mask;
    fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask;
    fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask;
    fn zip_low(self, rhs: impl SimdInto) -> Self;
    fn zip_high(self, rhs: impl SimdInto) -> Self;
    fn unzip_low(self, rhs: impl SimdInto) -> Self;
    fn unzip_high(self, rhs: impl SimdInto) -> Self;
    fn max(self, rhs: impl SimdInto) -> Self;
    fn max_precise(self, rhs: impl SimdInto) -> Self;
    fn min(self, rhs: impl SimdInto) -> Self;
    fn min_precise(self, rhs: impl SimdInto) -> Self;
    fn madd(self, op1: impl SimdInto, op2: impl SimdInto) -> Self;
    fn msub(self, op1: impl SimdInto, op2: impl SimdInto) -> Self;
    fn floor(self) -> Self;
    fn fract(self) -> Self;
    fn trunc(self) -> Self;
}

pub trait SimdInt:
    SimdBase
    + core::ops::Add
    + core::ops::AddAssign
    + core::ops::Add
    + core::ops::AddAssign
    + core::ops::Sub
    + core::ops::SubAssign
    + core::ops::Sub
    + core::ops::SubAssign
    + core::ops::Mul
    + core::ops::MulAssign
    + core::ops::Mul
    + core::ops::MulAssign
    + core::ops::BitAnd
    + core::ops::BitAndAssign
    + core::ops::BitAnd
    + core::ops::BitAndAssign
    + core::ops::BitOr
    + core::ops::BitOrAssign
    + core::ops::BitOr
    + core::ops::BitOrAssign
    + core::ops::BitXor
    + core::ops::BitXorAssign
    + core::ops::BitXor
    + core::ops::BitXorAssign
    + core::ops::Shl
    + core::ops::ShlAssign
    + core::ops::Shr
    + core::ops::ShrAssign
    + core::ops::Shr
    + core::ops::ShrAssign
{
    #[inline(always)]
    fn to_float<T: SimdCvtFloat<Self>>(self) -> T {
        T::float_from(self)
    }
    fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask;
    fn simd_lt(self, rhs: impl SimdInto) -> Self::Mask;
    fn simd_le(self, rhs: impl SimdInto) -> Self::Mask;
    fn simd_ge(self, rhs: impl SimdInto) -> Self::Mask;
    fn simd_gt(self, rhs: impl SimdInto) -> Self::Mask;
    fn zip_low(self, rhs: impl SimdInto) -> Self;
    fn zip_high(self, rhs: impl SimdInto) -> Self;
    fn unzip_low(self, rhs: impl SimdInto) -> Self;
    fn unzip_high(self, rhs: impl SimdInto) -> Self;
    fn min(self, rhs: impl SimdInto) -> Self;
    fn max(self, rhs: impl SimdInto) -> Self;
}

pub trait SimdMask:
    SimdBase + core::ops::Not + core::ops::BitAnd + core::ops::BitOr + core::ops::BitXor
{
    fn simd_eq(self, rhs: impl SimdInto) -> Self::Mask;
}
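// A usage sketch, not part of the crate sources: with the trait vocabulary
// above, a kernel can be written once and stay generic over the SIMD
// implementation `S`. The generic parameters spelled out here (`f32x4<S>`,
// `<S: Simd>`) are reconstructions, since this dump has lost most
// angle-bracketed parameter lists; `scale_in_place` itself is a hypothetical
// helper, not an item from the crate.
fn scale_in_place<S: Simd>(simd: S, data: &mut [f32; 4], by: f32) {
    // `from_slice` requires the slice to be exactly the vector width.
    let v = f32x4::from_slice(simd, &data[..]);
    // A scalar rhs goes through `SimdInto`, i.e. it is splatted across lanes.
    let scaled = v.mul(by);
    data.copy_from_slice(scaled.as_slice());
}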
fearless_simd-0.3.0/src/generated/simd_types.rs

// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

// This file is autogenerated by fearless_simd_gen

use crate::{Bytes, Select, Simd, SimdCvtFloat, SimdCvtTruncate, SimdFrom, SimdInto};

#[derive(Clone, Copy, Debug)]
#[repr(C, align(16))]
pub struct f32x4<S: Simd> {
    pub val: [f32; 4],
    pub simd: S,
}

impl<S: Simd> SimdFrom<[f32; 4], S> for f32x4<S> {
    #[inline(always)]
    fn simd_from(val: [f32; 4], simd: S) -> Self {
        Self {
            val: [val[0usize], val[1usize], val[2usize], val[3usize]],
            simd,
        }
    }
}

impl<S: Simd> From<f32x4<S>> for [f32; 4] {
    #[inline(always)]
    fn from(value: f32x4<S>) -> Self {
        value.val
    }
}

impl<S: Simd> core::ops::Deref for f32x4<S> {
    type Target = [f32; 4];
    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        &self.val
    }
}

impl<S: Simd> core::ops::DerefMut for f32x4<S> {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.val
    }
}

impl<S: Simd> SimdFrom<f32, S> for f32x4<S> {
    #[inline(always)]
    fn simd_from(value: f32, simd: S) -> Self {
        simd.splat_f32x4(value)
    }
}

impl<S: Simd> Select<f32x4<S>> for mask32x4<S> {
    #[inline(always)]
    fn select(self, if_true: f32x4<S>, if_false: f32x4<S>) -> f32x4<S> {
        self.simd.select_f32x4(self, if_true, if_false)
    }
}

impl<S: Simd> Bytes for f32x4<S> {
    type Bytes = u8x16<S>;
    #[inline(always)]
    fn to_bytes(self) -> Self::Bytes {
        unsafe {
            u8x16 {
                val: core::mem::transmute(self.val),
                simd: self.simd,
            }
        }
    }
    #[inline(always)]
    fn from_bytes(value: Self::Bytes) -> Self {
        unsafe {
            Self {
                val: core::mem::transmute(value.val),
                simd: value.simd,
            }
        }
    }
}
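// A usage sketch, not part of the generated file: `Select` is how a mask
// produced by a lane-wise comparison is applied. The scalar `0.0` argument
// relies on the `SimdFrom<f32, S>` splat conversion defined above; the generic
// parameters are reconstructions, and `clamp_negative_to_zero` is a
// hypothetical helper.
fn clamp_negative_to_zero<S: Simd>(v: f32x4<S>) -> f32x4<S> {
    let zero = f32x4::simd_from(0.0, v.simd);
    // Lanes where `v < 0.0` take `zero`; all other lanes keep `v`.
    v.simd_lt(0.0).select(zero, v)
}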
impl<S: Simd> f32x4<S> {
    #[inline(always)]
    pub fn abs(self) -> f32x4<S> { self.simd.abs_f32x4(self) }
    #[inline(always)]
    pub fn neg(self) -> f32x4<S> { self.simd.neg_f32x4(self) }
    #[inline(always)]
    pub fn sqrt(self) -> f32x4<S> { self.simd.sqrt_f32x4(self) }
    #[inline(always)]
    pub fn add(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.add_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn sub(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.sub_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn mul(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.mul_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn div(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.div_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn copysign(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.copysign_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn simd_eq(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_eq_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn simd_lt(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_lt_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn simd_le(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_le_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn simd_ge(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_ge_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn simd_gt(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_gt_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn max(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.max_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn max_precise(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.max_precise_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn min(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.min_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn min_precise(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.min_precise_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn floor(self) -> f32x4<S> { self.simd.floor_f32x4(self) }
    #[inline(always)]
    pub fn fract(self) -> f32x4<S> { self.simd.fract_f32x4(self) }
    #[inline(always)]
    pub fn trunc(self) -> f32x4<S> { self.simd.trunc_f32x4(self) }
    #[inline(always)]
    pub fn combine(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x8<S> { self.simd.combine_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    pub fn reinterpret_f64(self) -> f64x2<S> { self.simd.reinterpret_f64_f32x4(self) }
    #[inline(always)]
    pub fn reinterpret_i32(self) -> i32x4<S> { self.simd.reinterpret_i32_f32x4(self) }
    #[inline(always)]
    pub fn reinterpret_u8(self) -> u8x16<S> { self.simd.reinterpret_u8_f32x4(self) }
    #[inline(always)]
    pub fn reinterpret_u32(self) -> u32x4<S> { self.simd.reinterpret_u32_f32x4(self) }
    #[inline(always)]
    pub fn cvt_u32(self) -> u32x4<S> { self.simd.cvt_u32_f32x4(self) }
    #[inline(always)]
    pub fn cvt_i32(self) -> i32x4<S> { self.simd.cvt_i32_f32x4(self) }
}

impl<S: Simd> crate::SimdBase<f32, S> for f32x4<S> {
    const N: usize = 4;
    type Mask = mask32x4<S>;
    type Block = f32x4<S>;
    #[inline(always)]
    fn witness(&self) -> S { self.simd }
    #[inline(always)]
    fn as_slice(&self) -> &[f32] { &self.val }
    #[inline(always)]
    fn as_mut_slice(&mut self) -> &mut [f32] { &mut self.val }
    #[inline(always)]
    fn from_slice(simd: S, slice: &[f32]) -> Self {
        let mut val = [0.0; 4];
        val.copy_from_slice(slice);
        Self { val, simd }
    }
    #[inline(always)]
    fn splat(simd: S, val: f32) -> Self { simd.splat_f32x4(val) }
    #[inline(always)]
    fn block_splat(block: Self::Block) -> Self { block }
}

impl<S: Simd> crate::SimdFloat<f32, S> for f32x4<S> {
    #[inline(always)]
    fn abs(self) -> f32x4<S> { self.simd.abs_f32x4(self) }
    #[inline(always)]
    fn sqrt(self) -> f32x4<S> { self.simd.sqrt_f32x4(self) }
    #[inline(always)]
    fn copysign(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.copysign_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    fn simd_eq(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_eq_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    fn simd_lt(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_lt_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    fn simd_le(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_le_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    fn simd_ge(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_ge_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    fn simd_gt(self, rhs: impl SimdInto<f32x4<S>, S>) -> mask32x4<S> { self.simd.simd_gt_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    fn zip_low(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.zip_low_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    fn zip_high(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.zip_high_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    fn unzip_low(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> { self.simd.unzip_low_f32x4(self, rhs.simd_into(self.simd)) }
    #[inline(always)]
    fn unzip_high(self, rhs: impl SimdInto<f32x4<S>, S>) -> f32x4<S> {
self.simd.unzip_high_f32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> f32x4 { self.simd.max_f32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max_precise(self, rhs: impl SimdInto) -> f32x4 { self.simd.max_precise_f32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> f32x4 { self.simd.min_f32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min_precise(self, rhs: impl SimdInto) -> f32x4 { self.simd.min_precise_f32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn madd(self, op1: impl SimdInto, op2: impl SimdInto) -> f32x4 { self.simd .madd_f32x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] fn msub(self, op1: impl SimdInto, op2: impl SimdInto) -> f32x4 { self.simd .msub_f32x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] fn floor(self) -> f32x4 { self.simd.floor_f32x4(self) } #[inline(always)] fn fract(self) -> f32x4 { self.simd.fract_f32x4(self) } #[inline(always)] fn trunc(self) -> f32x4 { self.simd.trunc_f32x4(self) } } impl SimdCvtFloat> for f32x4 { fn float_from(x: u32x4) -> Self { x.simd.cvt_f32_u32x4(x) } } impl SimdCvtFloat> for f32x4 { fn float_from(x: i32x4) -> Self { x.simd.cvt_f32_i32x4(x) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub struct i8x16 { pub val: [i8; 16], pub simd: S, } impl SimdFrom<[i8; 16], S> for i8x16 { #[inline(always)] fn simd_from(val: [i8; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [i8; 16] { #[inline(always)] fn from(value: i8x16) -> Self { value.val } } impl core::ops::Deref for i8x16 { type Target = [i8; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for i8x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for i8x16 { #[inline(always)] fn simd_from(value: i8, simd: S) -> Self { simd.splat_i8x16(value) } } impl Select> for mask8x16 { #[inline(always)] fn select(self, if_true: i8x16, if_false: i8x16) -> i8x16 { self.simd.select_i8x16(self, if_true, if_false) } } impl Bytes for i8x16 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl i8x16 { #[inline(always)] pub fn not(self) -> i8x16 { self.simd.not_i8x16(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> i8x16 { self.simd.add_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> i8x16 { self.simd.sub_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> i8x16 { self.simd.mul_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> i8x16 { self.simd.and_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> i8x16 { self.simd.or_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> i8x16 { self.simd.xor_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> i8x16 { 
self.simd.shr_i8x16(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> i8x16 { self.simd.shrv_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> i8x16 { self.simd.shl_i8x16(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_eq_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_lt_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_le_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_ge_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_gt_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> i8x16 { self.simd.min_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> i8x16 { self.simd.max_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> i8x32 { self.simd.combine_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn neg(self) -> i8x16 { self.simd.neg_i8x16(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x16 { self.simd.reinterpret_u8_i8x16(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x4 { self.simd.reinterpret_u32_i8x16(self) } } impl crate::SimdBase for i8x16 { const N: usize = 16; type Mask = mask8x16; type Block = i8x16; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i8] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i8] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { let mut val = [0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i8) -> Self { simd.splat_i8x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdInt for i8x16 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_eq_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_lt_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_le_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_ge_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_gt_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> i8x16 { self.simd.zip_low_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> i8x16 { self.simd.zip_high_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> i8x16 { self.simd.unzip_low_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> i8x16 { self.simd.unzip_high_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> i8x16 { self.simd.min_i8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> i8x16 { self.simd.max_i8x16(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, 
Debug)] #[repr(C, align(16))] pub struct u8x16 { pub val: [u8; 16], pub simd: S, } impl SimdFrom<[u8; 16], S> for u8x16 { #[inline(always)] fn simd_from(val: [u8; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [u8; 16] { #[inline(always)] fn from(value: u8x16) -> Self { value.val } } impl core::ops::Deref for u8x16 { type Target = [u8; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for u8x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for u8x16 { #[inline(always)] fn simd_from(value: u8, simd: S) -> Self { simd.splat_u8x16(value) } } impl Select> for mask8x16 { #[inline(always)] fn select(self, if_true: u8x16, if_false: u8x16) -> u8x16 { self.simd.select_u8x16(self, if_true, if_false) } } impl Bytes for u8x16 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl u8x16 { #[inline(always)] pub fn not(self) -> u8x16 { self.simd.not_u8x16(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> u8x16 { self.simd.add_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> u8x16 { self.simd.sub_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> u8x16 { self.simd.mul_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> u8x16 { self.simd.and_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> u8x16 { self.simd.or_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> u8x16 { self.simd.xor_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> u8x16 { self.simd.shr_u8x16(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> u8x16 { self.simd.shrv_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> u8x16 { self.simd.shl_u8x16(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_eq_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_lt_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_le_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_ge_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_gt_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> u8x16 { self.simd.min_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> u8x16 { self.simd.max_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> u8x32 { self.simd.combine_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x4 { 
self.simd.reinterpret_u32_u8x16(self) } } impl crate::SimdBase for u8x16 { const N: usize = 16; type Mask = mask8x16; type Block = u8x16; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[u8] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [u8] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[u8]) -> Self { let mut val = [0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: u8) -> Self { simd.splat_u8x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdInt for u8x16 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_eq_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_lt_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_le_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_ge_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_gt_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> u8x16 { self.simd.zip_low_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> u8x16 { self.simd.zip_high_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> u8x16 { self.simd.unzip_low_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> u8x16 { self.simd.unzip_high_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> u8x16 { self.simd.min_u8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> u8x16 { self.simd.max_u8x16(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub struct mask8x16 { pub val: [i8; 16], pub simd: S, } impl SimdFrom<[i8; 16], S> for mask8x16 { #[inline(always)] fn simd_from(val: [i8; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [i8; 16] { #[inline(always)] fn from(value: mask8x16) -> Self { value.val } } impl core::ops::Deref for mask8x16 { type Target = [i8; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask8x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask8x16 { #[inline(always)] fn simd_from(value: i8, simd: S) -> Self { simd.splat_mask8x16(value) } } impl Select> for mask8x16 { #[inline(always)] fn select(self, if_true: mask8x16, if_false: mask8x16) -> mask8x16 { self.simd.select_mask8x16(self, if_true, if_false) } } impl Bytes for mask8x16 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl mask8x16 { #[inline(always)] pub fn not(self) -> mask8x16 { self.simd.not_mask8x16(self) } #[inline(always)] 
pub fn and(self, rhs: impl SimdInto) -> mask8x16 { self.simd.and_mask8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> mask8x16 { self.simd.or_mask8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> mask8x16 { self.simd.xor_mask8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_eq_mask8x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> mask8x32 { self.simd.combine_mask8x16(self, rhs.simd_into(self.simd)) } } impl crate::SimdBase for mask8x16 { const N: usize = 16; type Mask = mask8x16; type Block = mask8x16; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i8] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i8] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { let mut val = [0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i8) -> Self { simd.splat_mask8x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdMask for mask8x16 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask8x16 { self.simd.simd_eq_mask8x16(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub struct i16x8 { pub val: [i16; 8], pub simd: S, } impl SimdFrom<[i16; 8], S> for i16x8 { #[inline(always)] fn simd_from(val: [i16; 8], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], ], simd, } } } impl From> for [i16; 8] { #[inline(always)] fn from(value: i16x8) -> Self { value.val } } impl core::ops::Deref for i16x8 { type Target = [i16; 8]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for i16x8 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for i16x8 { #[inline(always)] fn simd_from(value: i16, simd: S) -> Self { simd.splat_i16x8(value) } } impl Select> for mask16x8 { #[inline(always)] fn select(self, if_true: i16x8, if_false: i16x8) -> i16x8 { self.simd.select_i16x8(self, if_true, if_false) } } impl Bytes for i16x8 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl i16x8 { #[inline(always)] pub fn not(self) -> i16x8 { self.simd.not_i16x8(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> i16x8 { self.simd.add_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> i16x8 { self.simd.sub_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> i16x8 { self.simd.mul_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> i16x8 { self.simd.and_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> i16x8 { self.simd.or_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> i16x8 { self.simd.xor_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> i16x8 { self.simd.shr_i16x8(self, shift) } 
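    // A note on the two shift flavors (a reading aid, not generated text):
    // `shr` above shifts every lane right by the same scalar amount, while
    // `shrv` below is the lane-wise variant, taking a vector of per-lane shift
    // counts. Hypothetical use, with `counts` an i16x8 of shift amounts:
    //     let a = v.shr(3);       // every lane shifted right by 3
    //     let b = v.shrv(counts); // lane i shifted right by counts[i]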
#[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> i16x8 { self.simd.shrv_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> i16x8 { self.simd.shl_i16x8(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_eq_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_lt_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_le_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_ge_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_gt_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> i16x8 { self.simd.min_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> i16x8 { self.simd.max_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> i16x16 { self.simd.combine_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn neg(self) -> i16x8 { self.simd.neg_i16x8(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x16 { self.simd.reinterpret_u8_i16x8(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x4 { self.simd.reinterpret_u32_i16x8(self) } } impl crate::SimdBase for i16x8 { const N: usize = 8; type Mask = mask16x8; type Block = i16x8; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i16] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i16] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { let mut val = [0; 8]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i16) -> Self { simd.splat_i16x8(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdInt for i16x8 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_eq_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_lt_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_le_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_ge_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_gt_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> i16x8 { self.simd.zip_low_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> i16x8 { self.simd.zip_high_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> i16x8 { self.simd.unzip_low_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> i16x8 { self.simd.unzip_high_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> i16x8 { self.simd.min_i16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> i16x8 { self.simd.max_i16x8(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub 
struct u16x8 { pub val: [u16; 8], pub simd: S, } impl SimdFrom<[u16; 8], S> for u16x8 { #[inline(always)] fn simd_from(val: [u16; 8], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], ], simd, } } } impl From> for [u16; 8] { #[inline(always)] fn from(value: u16x8) -> Self { value.val } } impl core::ops::Deref for u16x8 { type Target = [u16; 8]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for u16x8 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for u16x8 { #[inline(always)] fn simd_from(value: u16, simd: S) -> Self { simd.splat_u16x8(value) } } impl Select> for mask16x8 { #[inline(always)] fn select(self, if_true: u16x8, if_false: u16x8) -> u16x8 { self.simd.select_u16x8(self, if_true, if_false) } } impl Bytes for u16x8 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl u16x8 { #[inline(always)] pub fn not(self) -> u16x8 { self.simd.not_u16x8(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> u16x8 { self.simd.add_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> u16x8 { self.simd.sub_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> u16x8 { self.simd.mul_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> u16x8 { self.simd.and_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> u16x8 { self.simd.or_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> u16x8 { self.simd.xor_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> u16x8 { self.simd.shr_u16x8(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> u16x8 { self.simd.shrv_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> u16x8 { self.simd.shl_u16x8(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_eq_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_lt_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_le_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_ge_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_gt_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> u16x8 { self.simd.min_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> u16x8 { self.simd.max_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> u16x16 { self.simd.combine_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x16 { self.simd.reinterpret_u8_u16x8(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x4 { self.simd.reinterpret_u32_u16x8(self) } } impl 
crate::SimdBase for u16x8 { const N: usize = 8; type Mask = mask16x8; type Block = u16x8; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[u16] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [u16] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[u16]) -> Self { let mut val = [0; 8]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: u16) -> Self { simd.splat_u16x8(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdInt for u16x8 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_eq_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_lt_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_le_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_ge_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_gt_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> u16x8 { self.simd.zip_low_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> u16x8 { self.simd.zip_high_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> u16x8 { self.simd.unzip_low_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> u16x8 { self.simd.unzip_high_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> u16x8 { self.simd.min_u16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> u16x8 { self.simd.max_u16x8(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub struct mask16x8 { pub val: [i16; 8], pub simd: S, } impl SimdFrom<[i16; 8], S> for mask16x8 { #[inline(always)] fn simd_from(val: [i16; 8], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], ], simd, } } } impl From> for [i16; 8] { #[inline(always)] fn from(value: mask16x8) -> Self { value.val } } impl core::ops::Deref for mask16x8 { type Target = [i16; 8]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask16x8 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask16x8 { #[inline(always)] fn simd_from(value: i16, simd: S) -> Self { simd.splat_mask16x8(value) } } impl Select> for mask16x8 { #[inline(always)] fn select(self, if_true: mask16x8, if_false: mask16x8) -> mask16x8 { self.simd.select_mask16x8(self, if_true, if_false) } } impl Bytes for mask16x8 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl mask16x8 { #[inline(always)] pub fn not(self) -> mask16x8 { self.simd.not_mask16x8(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> mask16x8 { self.simd.and_mask16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl 
SimdInto) -> mask16x8 { self.simd.or_mask16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> mask16x8 { self.simd.xor_mask16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_eq_mask16x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> mask16x16 { self.simd.combine_mask16x8(self, rhs.simd_into(self.simd)) } } impl crate::SimdBase for mask16x8 { const N: usize = 8; type Mask = mask16x8; type Block = mask16x8; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i16] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i16] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { let mut val = [0; 8]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i16) -> Self { simd.splat_mask16x8(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdMask for mask16x8 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask16x8 { self.simd.simd_eq_mask16x8(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub struct i32x4 { pub val: [i32; 4], pub simd: S, } impl SimdFrom<[i32; 4], S> for i32x4 { #[inline(always)] fn simd_from(val: [i32; 4], simd: S) -> Self { Self { val: [val[0usize], val[1usize], val[2usize], val[3usize]], simd, } } } impl From> for [i32; 4] { #[inline(always)] fn from(value: i32x4) -> Self { value.val } } impl core::ops::Deref for i32x4 { type Target = [i32; 4]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for i32x4 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for i32x4 { #[inline(always)] fn simd_from(value: i32, simd: S) -> Self { simd.splat_i32x4(value) } } impl Select> for mask32x4 { #[inline(always)] fn select(self, if_true: i32x4, if_false: i32x4) -> i32x4 { self.simd.select_i32x4(self, if_true, if_false) } } impl Bytes for i32x4 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl i32x4 { #[inline(always)] pub fn not(self) -> i32x4 { self.simd.not_i32x4(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> i32x4 { self.simd.add_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> i32x4 { self.simd.sub_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> i32x4 { self.simd.mul_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> i32x4 { self.simd.and_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> i32x4 { self.simd.or_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> i32x4 { self.simd.xor_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> i32x4 { self.simd.shr_i32x4(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> i32x4 { self.simd.shrv_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> i32x4 { self.simd.shl_i32x4(self, shift) 
} #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_eq_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_lt_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_le_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_ge_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_gt_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> i32x4 { self.simd.min_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> i32x4 { self.simd.max_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> i32x8 { self.simd.combine_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn neg(self) -> i32x4 { self.simd.neg_i32x4(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x16 { self.simd.reinterpret_u8_i32x4(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x4 { self.simd.reinterpret_u32_i32x4(self) } #[inline(always)] pub fn cvt_f32(self) -> f32x4 { self.simd.cvt_f32_i32x4(self) } } impl crate::SimdBase for i32x4 { const N: usize = 4; type Mask = mask32x4; type Block = i32x4; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { let mut val = [0; 4]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i32) -> Self { simd.splat_i32x4(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdInt for i32x4 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_eq_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_lt_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_le_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_ge_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_gt_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> i32x4 { self.simd.zip_low_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> i32x4 { self.simd.zip_high_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> i32x4 { self.simd.unzip_low_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> i32x4 { self.simd.unzip_high_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> i32x4 { self.simd.min_i32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> i32x4 { self.simd.max_i32x4(self, rhs.simd_into(self.simd)) } } impl SimdCvtTruncate> for i32x4 { fn truncate_from(x: f32x4) -> Self { x.simd.cvt_i32_f32x4(x) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub struct u32x4 { pub val: [u32; 4], 
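    // The `simd` field below is the capability witness: a copy of the `Simd`
    // token whose existence proves the required target features were
    // detected. Methods forward to `self.simd` (see `witness` in `SimdBase`),
    // so no per-call feature re-detection is needed.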
pub simd: S, } impl SimdFrom<[u32; 4], S> for u32x4 { #[inline(always)] fn simd_from(val: [u32; 4], simd: S) -> Self { Self { val: [val[0usize], val[1usize], val[2usize], val[3usize]], simd, } } } impl From> for [u32; 4] { #[inline(always)] fn from(value: u32x4) -> Self { value.val } } impl core::ops::Deref for u32x4 { type Target = [u32; 4]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for u32x4 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for u32x4 { #[inline(always)] fn simd_from(value: u32, simd: S) -> Self { simd.splat_u32x4(value) } } impl Select> for mask32x4 { #[inline(always)] fn select(self, if_true: u32x4, if_false: u32x4) -> u32x4 { self.simd.select_u32x4(self, if_true, if_false) } } impl Bytes for u32x4 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl u32x4 { #[inline(always)] pub fn not(self) -> u32x4 { self.simd.not_u32x4(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> u32x4 { self.simd.add_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> u32x4 { self.simd.sub_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> u32x4 { self.simd.mul_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> u32x4 { self.simd.and_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> u32x4 { self.simd.or_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> u32x4 { self.simd.xor_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> u32x4 { self.simd.shr_u32x4(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> u32x4 { self.simd.shrv_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> u32x4 { self.simd.shl_u32x4(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_eq_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_lt_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_le_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_ge_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_gt_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> u32x4 { self.simd.min_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> u32x4 { self.simd.max_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> u32x8 { self.simd.combine_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x16 { self.simd.reinterpret_u8_u32x4(self) } #[inline(always)] pub fn cvt_f32(self) -> f32x4 { self.simd.cvt_f32_u32x4(self) } } impl crate::SimdBase for u32x4 { const N: usize = 4; type Mask = mask32x4; type Block = u32x4; #[inline(always)] fn 
witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[u32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [u32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[u32]) -> Self { let mut val = [0; 4]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: u32) -> Self { simd.splat_u32x4(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdInt for u32x4 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_eq_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_lt_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_le_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_ge_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_gt_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> u32x4 { self.simd.zip_low_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> u32x4 { self.simd.zip_high_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> u32x4 { self.simd.unzip_low_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> u32x4 { self.simd.unzip_high_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> u32x4 { self.simd.min_u32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> u32x4 { self.simd.max_u32x4(self, rhs.simd_into(self.simd)) } } impl SimdCvtTruncate> for u32x4 { fn truncate_from(x: f32x4) -> Self { x.simd.cvt_u32_f32x4(x) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub struct mask32x4 { pub val: [i32; 4], pub simd: S, } impl SimdFrom<[i32; 4], S> for mask32x4 { #[inline(always)] fn simd_from(val: [i32; 4], simd: S) -> Self { Self { val: [val[0usize], val[1usize], val[2usize], val[3usize]], simd, } } } impl From> for [i32; 4] { #[inline(always)] fn from(value: mask32x4) -> Self { value.val } } impl core::ops::Deref for mask32x4 { type Target = [i32; 4]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask32x4 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask32x4 { #[inline(always)] fn simd_from(value: i32, simd: S) -> Self { simd.splat_mask32x4(value) } } impl Select> for mask32x4 { #[inline(always)] fn select(self, if_true: mask32x4, if_false: mask32x4) -> mask32x4 { self.simd.select_mask32x4(self, if_true, if_false) } } impl Bytes for mask32x4 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl mask32x4 { #[inline(always)] pub fn not(self) -> mask32x4 { self.simd.not_mask32x4(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> mask32x4 { self.simd.and_mask32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> mask32x4 { self.simd.or_mask32x4(self, 
rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> mask32x4 { self.simd.xor_mask32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_eq_mask32x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> mask32x8 { self.simd.combine_mask32x4(self, rhs.simd_into(self.simd)) } } impl crate::SimdBase for mask32x4 { const N: usize = 4; type Mask = mask32x4; type Block = mask32x4; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { let mut val = [0; 4]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i32) -> Self { simd.splat_mask32x4(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdMask for mask32x4 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask32x4 { self.simd.simd_eq_mask32x4(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(16))] pub struct f64x2 { pub val: [f64; 2], pub simd: S, } impl SimdFrom<[f64; 2], S> for f64x2 { #[inline(always)] fn simd_from(val: [f64; 2], simd: S) -> Self { Self { val: [val[0usize], val[1usize]], simd, } } } impl From> for [f64; 2] { #[inline(always)] fn from(value: f64x2) -> Self { value.val } } impl core::ops::Deref for f64x2 { type Target = [f64; 2]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for f64x2 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for f64x2 { #[inline(always)] fn simd_from(value: f64, simd: S) -> Self { simd.splat_f64x2(value) } } impl Select> for mask64x2 { #[inline(always)] fn select(self, if_true: f64x2, if_false: f64x2) -> f64x2 { self.simd.select_f64x2(self, if_true, if_false) } } impl Bytes for f64x2 { type Bytes = u8x16; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x16 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl f64x2 { #[inline(always)] pub fn abs(self) -> f64x2 { self.simd.abs_f64x2(self) } #[inline(always)] pub fn neg(self) -> f64x2 { self.simd.neg_f64x2(self) } #[inline(always)] pub fn sqrt(self) -> f64x2 { self.simd.sqrt_f64x2(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> f64x2 { self.simd.add_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> f64x2 { self.simd.sub_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> f64x2 { self.simd.mul_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn div(self, rhs: impl SimdInto) -> f64x2 { self.simd.div_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn copysign(self, rhs: impl SimdInto) -> f64x2 { self.simd.copysign_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask64x2 { self.simd.simd_eq_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask64x2 { self.simd.simd_lt_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask64x2 { 
self.simd.simd_le_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask64x2 { self.simd.simd_ge_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask64x2 { self.simd.simd_gt_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> f64x2 { self.simd.max_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max_precise(self, rhs: impl SimdInto) -> f64x2 { self.simd.max_precise_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> f64x2 { self.simd.min_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min_precise(self, rhs: impl SimdInto) -> f64x2 { self.simd.min_precise_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn floor(self) -> f64x2 { self.simd.floor_f64x2(self) } #[inline(always)] pub fn fract(self) -> f64x2 { self.simd.fract_f64x2(self) } #[inline(always)] pub fn trunc(self) -> f64x2 { self.simd.trunc_f64x2(self) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> f64x4 { self.simd.combine_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_f32(self) -> f32x4 { self.simd.reinterpret_f32_f64x2(self) } } impl crate::SimdBase for f64x2 { const N: usize = 2; type Mask = mask64x2; type Block = f64x2; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[f64] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [f64] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[f64]) -> Self { let mut val = [0.0; 2]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: f64) -> Self { simd.splat_f64x2(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block } } impl crate::SimdFloat for f64x2 { #[inline(always)] fn abs(self) -> f64x2 { self.simd.abs_f64x2(self) } #[inline(always)] fn sqrt(self) -> f64x2 { self.simd.sqrt_f64x2(self) } #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> f64x2 { self.simd.copysign_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask64x2 { self.simd.simd_eq_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask64x2 { self.simd.simd_lt_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask64x2 { self.simd.simd_le_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask64x2 { self.simd.simd_ge_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask64x2 { self.simd.simd_gt_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> f64x2 { self.simd.zip_low_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> f64x2 { self.simd.zip_high_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> f64x2 { self.simd.unzip_low_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> f64x2 { self.simd.unzip_high_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> f64x2 { self.simd.max_f64x2(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max_precise(self, rhs: impl SimdInto) -> f64x2 { self.simd.max_precise_f64x2(self, 
            rhs.simd_into(self.simd))
    }
    #[inline(always)]
    fn min(self, rhs: impl SimdInto<Self, S>) -> f64x2<S> {
        self.simd.min_f64x2(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    fn min_precise(self, rhs: impl SimdInto<Self, S>) -> f64x2<S> {
        self.simd.min_precise_f64x2(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    fn madd(self, op1: impl SimdInto<Self, S>, op2: impl SimdInto<Self, S>) -> f64x2<S> {
        self.simd
            .madd_f64x2(self, op1.simd_into(self.simd), op2.simd_into(self.simd))
    }
    #[inline(always)]
    fn msub(self, op1: impl SimdInto<Self, S>, op2: impl SimdInto<Self, S>) -> f64x2<S> {
        self.simd
            .msub_f64x2(self, op1.simd_into(self.simd), op2.simd_into(self.simd))
    }
    #[inline(always)]
    fn floor(self) -> f64x2<S> {
        self.simd.floor_f64x2(self)
    }
    #[inline(always)]
    fn fract(self) -> f64x2<S> {
        self.simd.fract_f64x2(self)
    }
    #[inline(always)]
    fn trunc(self) -> f64x2<S> {
        self.simd.trunc_f64x2(self)
    }
}
#[derive(Clone, Copy, Debug)]
#[repr(C, align(16))]
pub struct mask64x2<S: Simd> {
    pub val: [i64; 2],
    pub simd: S,
}
impl<S: Simd> SimdFrom<[i64; 2], S> for mask64x2<S> {
    #[inline(always)]
    fn simd_from(val: [i64; 2], simd: S) -> Self {
        Self {
            val: [val[0usize], val[1usize]],
            simd,
        }
    }
}
impl<S: Simd> From<mask64x2<S>> for [i64; 2] {
    #[inline(always)]
    fn from(value: mask64x2<S>) -> Self {
        value.val
    }
}
impl<S: Simd> core::ops::Deref for mask64x2<S> {
    type Target = [i64; 2];
    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        &self.val
    }
}
impl<S: Simd> core::ops::DerefMut for mask64x2<S> {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.val
    }
}
impl<S: Simd> SimdFrom<i64, S> for mask64x2<S> {
    #[inline(always)]
    fn simd_from(value: i64, simd: S) -> Self {
        simd.splat_mask64x2(value)
    }
}
impl<S: Simd> Select<mask64x2<S>> for mask64x2<S> {
    #[inline(always)]
    fn select(self, if_true: mask64x2<S>, if_false: mask64x2<S>) -> mask64x2<S> {
        self.simd.select_mask64x2(self, if_true, if_false)
    }
}
impl<S: Simd> Bytes for mask64x2<S> {
    type Bytes = u8x16<S>;
    #[inline(always)]
    fn to_bytes(self) -> Self::Bytes {
        unsafe {
            u8x16 {
                val: core::mem::transmute(self.val),
                simd: self.simd,
            }
        }
    }
    #[inline(always)]
    fn from_bytes(value: Self::Bytes) -> Self {
        unsafe {
            Self {
                val: core::mem::transmute(value.val),
                simd: value.simd,
            }
        }
    }
}
impl<S: Simd> mask64x2<S> {
    #[inline(always)]
    pub fn not(self) -> mask64x2<S> {
        self.simd.not_mask64x2(self)
    }
    #[inline(always)]
    pub fn and(self, rhs: impl SimdInto<Self, S>) -> mask64x2<S> {
        self.simd.and_mask64x2(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn or(self, rhs: impl SimdInto<Self, S>) -> mask64x2<S> {
        self.simd.or_mask64x2(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn xor(self, rhs: impl SimdInto<Self, S>) -> mask64x2<S> {
        self.simd.xor_mask64x2(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> mask64x2<S> {
        self.simd.simd_eq_mask64x2(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn combine(self, rhs: impl SimdInto<Self, S>) -> mask64x4<S> {
        self.simd.combine_mask64x2(self, rhs.simd_into(self.simd))
    }
}
impl<S: Simd> crate::SimdBase<i64, S> for mask64x2<S> {
    const N: usize = 2;
    type Mask = mask64x2<S>;
    type Block = mask64x2<S>;
    #[inline(always)]
    fn witness(&self) -> S {
        self.simd
    }
    #[inline(always)]
    fn as_slice(&self) -> &[i64] {
        &self.val
    }
    #[inline(always)]
    fn as_mut_slice(&mut self) -> &mut [i64] {
        &mut self.val
    }
    #[inline(always)]
    fn from_slice(simd: S, slice: &[i64]) -> Self {
        let mut val = [0; 2];
        val.copy_from_slice(slice);
        Self { val, simd }
    }
    #[inline(always)]
    fn splat(simd: S, val: i64) -> Self {
        simd.splat_mask64x2(val)
    }
    #[inline(always)]
    fn block_splat(block: Self::Block) -> Self {
        block
    }
}
impl<S: Simd> crate::SimdMask<i64, S> for mask64x2<S> {
    #[inline(always)]
    fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> mask64x2<S> {
        self.simd.simd_eq_mask64x2(self, rhs.simd_into(self.simd))
    }
}
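// Masks drive branch-free selection via the `Select` impls above. A minimal
// illustrative sketch (the helper below is hypothetical, not part of the
// generated code): lane-wise absolute difference for `f64x2`, built from a
// comparison and `select`.
//
//     fn abs_diff<S: Simd>(a: f64x2<S>, b: f64x2<S>) -> f64x2<S> {
//         a.simd_lt(b).select(b.sub(a), a.sub(b))
//     }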
#[derive(Clone, Copy, Debug)]
#[repr(C, align(32))]
pub struct f32x8<S: Simd> {
    pub val: [f32; 8],
    pub simd: S,
}
impl<S: Simd> SimdFrom<[f32; 8], S> for f32x8<S> {
    #[inline(always)]
    fn simd_from(val: [f32; 8], simd: S) -> Self {
        Self {
            val: [
                val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize],
                val[6usize], val[7usize],
            ],
            simd,
        }
    }
}
impl<S: Simd> From<f32x8<S>> for [f32; 8] {
    #[inline(always)]
    fn from(value: f32x8<S>) -> Self {
        value.val
    }
}
impl<S: Simd> core::ops::Deref for f32x8<S> {
    type Target = [f32; 8];
    #[inline(always)]
    fn deref(&self) -> &Self::Target {
        &self.val
    }
}
impl<S: Simd> core::ops::DerefMut for f32x8<S> {
    #[inline(always)]
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.val
    }
}
impl<S: Simd> SimdFrom<f32, S> for f32x8<S> {
    #[inline(always)]
    fn simd_from(value: f32, simd: S) -> Self {
        simd.splat_f32x8(value)
    }
}
impl<S: Simd> Select<f32x8<S>> for mask32x8<S> {
    #[inline(always)]
    fn select(self, if_true: f32x8<S>, if_false: f32x8<S>) -> f32x8<S> {
        self.simd.select_f32x8(self, if_true, if_false)
    }
}
impl<S: Simd> Bytes for f32x8<S> {
    type Bytes = u8x32<S>;
    #[inline(always)]
    fn to_bytes(self) -> Self::Bytes {
        unsafe {
            u8x32 {
                val: core::mem::transmute(self.val),
                simd: self.simd,
            }
        }
    }
    #[inline(always)]
    fn from_bytes(value: Self::Bytes) -> Self {
        unsafe {
            Self {
                val: core::mem::transmute(value.val),
                simd: value.simd,
            }
        }
    }
}
impl<S: Simd> f32x8<S> {
    #[inline(always)]
    pub fn abs(self) -> f32x8<S> {
        self.simd.abs_f32x8(self)
    }
    #[inline(always)]
    pub fn neg(self) -> f32x8<S> {
        self.simd.neg_f32x8(self)
    }
    #[inline(always)]
    pub fn sqrt(self) -> f32x8<S> {
        self.simd.sqrt_f32x8(self)
    }
    #[inline(always)]
    pub fn add(self, rhs: impl SimdInto<Self, S>) -> f32x8<S> {
        self.simd.add_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn sub(self, rhs: impl SimdInto<Self, S>) -> f32x8<S> {
        self.simd.sub_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn mul(self, rhs: impl SimdInto<Self, S>) -> f32x8<S> {
        self.simd.mul_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn div(self, rhs: impl SimdInto<Self, S>) -> f32x8<S> {
        self.simd.div_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn copysign(self, rhs: impl SimdInto<Self, S>) -> f32x8<S> {
        self.simd.copysign_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> {
        self.simd.simd_eq_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn simd_lt(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> {
        self.simd.simd_lt_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn simd_le(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> {
        self.simd.simd_le_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn simd_ge(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> {
        self.simd.simd_ge_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn simd_gt(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> {
        self.simd.simd_gt_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn max(self, rhs: impl SimdInto<Self, S>) -> f32x8<S> {
        self.simd.max_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn max_precise(self, rhs: impl SimdInto<Self, S>) -> f32x8<S> {
        self.simd.max_precise_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn min(self, rhs: impl SimdInto<Self, S>) -> f32x8<S> {
        self.simd.min_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn min_precise(self, rhs: impl SimdInto<Self, S>) -> f32x8<S> {
        self.simd.min_precise_f32x8(self, rhs.simd_into(self.simd))
    }
    #[inline(always)]
    pub fn floor(self) -> f32x8<S> {
        self.simd.floor_f32x8(self)
    }
    #[inline(always)]
    pub fn fract(self) -> f32x8<S> {
        self.simd.fract_f32x8(self)
    }
    #[inline(always)]
    pub fn trunc(self) -> f32x8<S> {
        self.simd.trunc_f32x8(self)
    }
    #[inline(always)]
    pub fn combine(self,
rhs: impl SimdInto) -> f32x16 { self.simd.combine_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_f64(self) -> f64x4 { self.simd.reinterpret_f64_f32x8(self) } #[inline(always)] pub fn reinterpret_i32(self) -> i32x8 { self.simd.reinterpret_i32_f32x8(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x32 { self.simd.reinterpret_u8_f32x8(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x8 { self.simd.reinterpret_u32_f32x8(self) } #[inline(always)] pub fn cvt_u32(self) -> u32x8 { self.simd.cvt_u32_f32x8(self) } #[inline(always)] pub fn cvt_i32(self) -> i32x8 { self.simd.cvt_i32_f32x8(self) } } impl crate::SimdBase for f32x8 { const N: usize = 8; type Mask = mask32x8; type Block = f32x4; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[f32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [f32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[f32]) -> Self { let mut val = [0.0; 8]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: f32) -> Self { simd.splat_f32x8(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdFloat for f32x8 { #[inline(always)] fn abs(self) -> f32x8 { self.simd.abs_f32x8(self) } #[inline(always)] fn sqrt(self) -> f32x8 { self.simd.sqrt_f32x8(self) } #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> f32x8 { self.simd.copysign_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_eq_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_lt_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_le_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_ge_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_gt_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> f32x8 { self.simd.zip_low_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> f32x8 { self.simd.zip_high_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> f32x8 { self.simd.unzip_low_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> f32x8 { self.simd.unzip_high_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> f32x8 { self.simd.max_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max_precise(self, rhs: impl SimdInto) -> f32x8 { self.simd.max_precise_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> f32x8 { self.simd.min_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min_precise(self, rhs: impl SimdInto) -> f32x8 { self.simd.min_precise_f32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn madd(self, op1: impl SimdInto, op2: impl SimdInto) -> f32x8 { self.simd .madd_f32x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] fn msub(self, op1: impl SimdInto, op2: impl SimdInto) -> f32x8 { self.simd .msub_f32x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] fn floor(self) -> f32x8 
{ self.simd.floor_f32x8(self) } #[inline(always)] fn fract(self) -> f32x8 { self.simd.fract_f32x8(self) } #[inline(always)] fn trunc(self) -> f32x8 { self.simd.trunc_f32x8(self) } } impl SimdCvtFloat> for f32x8 { fn float_from(x: u32x8) -> Self { x.simd.cvt_f32_u32x8(x) } } impl SimdCvtFloat> for f32x8 { fn float_from(x: i32x8) -> Self { x.simd.cvt_f32_i32x8(x) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct i8x32 { pub val: [i8; 32], pub simd: S, } impl SimdFrom<[i8; 32], S> for i8x32 { #[inline(always)] fn simd_from(val: [i8; 32], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], val[16usize], val[17usize], val[18usize], val[19usize], val[20usize], val[21usize], val[22usize], val[23usize], val[24usize], val[25usize], val[26usize], val[27usize], val[28usize], val[29usize], val[30usize], val[31usize], ], simd, } } } impl From> for [i8; 32] { #[inline(always)] fn from(value: i8x32) -> Self { value.val } } impl core::ops::Deref for i8x32 { type Target = [i8; 32]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for i8x32 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for i8x32 { #[inline(always)] fn simd_from(value: i8, simd: S) -> Self { simd.splat_i8x32(value) } } impl Select> for mask8x32 { #[inline(always)] fn select(self, if_true: i8x32, if_false: i8x32) -> i8x32 { self.simd.select_i8x32(self, if_true, if_false) } } impl Bytes for i8x32 { type Bytes = u8x32; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl i8x32 { #[inline(always)] pub fn not(self) -> i8x32 { self.simd.not_i8x32(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> i8x32 { self.simd.add_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> i8x32 { self.simd.sub_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> i8x32 { self.simd.mul_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> i8x32 { self.simd.and_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> i8x32 { self.simd.or_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> i8x32 { self.simd.xor_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> i8x32 { self.simd.shr_i8x32(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> i8x32 { self.simd.shrv_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> i8x32 { self.simd.shl_i8x32(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_eq_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_lt_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_le_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask8x32 { 
self.simd.simd_ge_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_gt_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> i8x32 { self.simd.min_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> i8x32 { self.simd.max_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> i8x64 { self.simd.combine_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn neg(self) -> i8x32 { self.simd.neg_i8x32(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x32 { self.simd.reinterpret_u8_i8x32(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x8 { self.simd.reinterpret_u32_i8x32(self) } } impl crate::SimdBase for i8x32 { const N: usize = 32; type Mask = mask8x32; type Block = i8x16; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i8] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i8] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { let mut val = [0; 32]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i8) -> Self { simd.splat_i8x32(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdInt for i8x32 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_eq_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_lt_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_le_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_ge_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_gt_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> i8x32 { self.simd.zip_low_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> i8x32 { self.simd.zip_high_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> i8x32 { self.simd.unzip_low_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> i8x32 { self.simd.unzip_high_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> i8x32 { self.simd.min_i8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> i8x32 { self.simd.max_i8x32(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct u8x32 { pub val: [u8; 32], pub simd: S, } impl SimdFrom<[u8; 32], S> for u8x32 { #[inline(always)] fn simd_from(val: [u8; 32], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], val[16usize], val[17usize], val[18usize], val[19usize], val[20usize], val[21usize], val[22usize], val[23usize], val[24usize], val[25usize], val[26usize], val[27usize], val[28usize], val[29usize], val[30usize], val[31usize], ], simd, } } } impl From> for [u8; 32] { #[inline(always)] fn 
from(value: u8x32) -> Self { value.val } } impl core::ops::Deref for u8x32 { type Target = [u8; 32]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for u8x32 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for u8x32 { #[inline(always)] fn simd_from(value: u8, simd: S) -> Self { simd.splat_u8x32(value) } } impl Select> for mask8x32 { #[inline(always)] fn select(self, if_true: u8x32, if_false: u8x32) -> u8x32 { self.simd.select_u8x32(self, if_true, if_false) } } impl Bytes for u8x32 { type Bytes = u8x32; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl u8x32 { #[inline(always)] pub fn not(self) -> u8x32 { self.simd.not_u8x32(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> u8x32 { self.simd.add_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> u8x32 { self.simd.sub_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> u8x32 { self.simd.mul_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> u8x32 { self.simd.and_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> u8x32 { self.simd.or_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> u8x32 { self.simd.xor_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> u8x32 { self.simd.shr_u8x32(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> u8x32 { self.simd.shrv_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> u8x32 { self.simd.shl_u8x32(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_eq_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_lt_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_le_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_ge_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_gt_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> u8x32 { self.simd.min_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> u8x32 { self.simd.max_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> u8x64 { self.simd.combine_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x8 { self.simd.reinterpret_u32_u8x32(self) } } impl crate::SimdBase for u8x32 { const N: usize = 32; type Mask = mask8x32; type Block = u8x16; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[u8] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [u8] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[u8]) -> Self { let mut val = [0; 32]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] 
fn splat(simd: S, val: u8) -> Self { simd.splat_u8x32(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdInt for u8x32 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_eq_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_lt_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_le_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_ge_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_gt_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> u8x32 { self.simd.zip_low_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> u8x32 { self.simd.zip_high_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> u8x32 { self.simd.unzip_low_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> u8x32 { self.simd.unzip_high_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> u8x32 { self.simd.min_u8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> u8x32 { self.simd.max_u8x32(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct mask8x32 { pub val: [i8; 32], pub simd: S, } impl SimdFrom<[i8; 32], S> for mask8x32 { #[inline(always)] fn simd_from(val: [i8; 32], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], val[16usize], val[17usize], val[18usize], val[19usize], val[20usize], val[21usize], val[22usize], val[23usize], val[24usize], val[25usize], val[26usize], val[27usize], val[28usize], val[29usize], val[30usize], val[31usize], ], simd, } } } impl From> for [i8; 32] { #[inline(always)] fn from(value: mask8x32) -> Self { value.val } } impl core::ops::Deref for mask8x32 { type Target = [i8; 32]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask8x32 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask8x32 { #[inline(always)] fn simd_from(value: i8, simd: S) -> Self { simd.splat_mask8x32(value) } } impl Select> for mask8x32 { #[inline(always)] fn select(self, if_true: mask8x32, if_false: mask8x32) -> mask8x32 { self.simd.select_mask8x32(self, if_true, if_false) } } impl Bytes for mask8x32 { type Bytes = u8x32; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl mask8x32 { #[inline(always)] pub fn not(self) -> mask8x32 { self.simd.not_mask8x32(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> mask8x32 { self.simd.and_mask8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> mask8x32 { self.simd.or_mask8x32(self, rhs.simd_into(self.simd)) } 
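    // Mask methods compose with ordinary boolean algebra; e.g. a lane-wise
    // range test can be built from two comparisons. Illustrative only,
    // assuming a `Simd` witness `s` and a 32-byte slice `data` are in scope:
    //
    //     let v = u8x32::from_slice(s, data);
    //     let in_range = v.simd_ge(u8x32::splat(s, b'a'))
    //         .and(v.simd_le(u8x32::splat(s, b'z')));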
#[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> mask8x32 { self.simd.xor_mask8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_eq_mask8x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> mask8x64 { self.simd.combine_mask8x32(self, rhs.simd_into(self.simd)) } } impl crate::SimdBase for mask8x32 { const N: usize = 32; type Mask = mask8x32; type Block = mask8x16; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i8] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i8] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { let mut val = [0; 32]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i8) -> Self { simd.splat_mask8x32(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdMask for mask8x32 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask8x32 { self.simd.simd_eq_mask8x32(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct i16x16 { pub val: [i16; 16], pub simd: S, } impl SimdFrom<[i16; 16], S> for i16x16 { #[inline(always)] fn simd_from(val: [i16; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [i16; 16] { #[inline(always)] fn from(value: i16x16) -> Self { value.val } } impl core::ops::Deref for i16x16 { type Target = [i16; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for i16x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for i16x16 { #[inline(always)] fn simd_from(value: i16, simd: S) -> Self { simd.splat_i16x16(value) } } impl Select> for mask16x16 { #[inline(always)] fn select(self, if_true: i16x16, if_false: i16x16) -> i16x16 { self.simd.select_i16x16(self, if_true, if_false) } } impl Bytes for i16x16 { type Bytes = u8x32; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl i16x16 { #[inline(always)] pub fn not(self) -> i16x16 { self.simd.not_i16x16(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> i16x16 { self.simd.add_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> i16x16 { self.simd.sub_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> i16x16 { self.simd.mul_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> i16x16 { self.simd.and_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> i16x16 { self.simd.or_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> i16x16 { self.simd.xor_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> i16x16 { self.simd.shr_i16x16(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> i16x16 { 
self.simd.shrv_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> i16x16 { self.simd.shl_i16x16(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_eq_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_lt_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_le_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_ge_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_gt_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> i16x16 { self.simd.min_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> i16x16 { self.simd.max_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> i16x32 { self.simd.combine_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn neg(self) -> i16x16 { self.simd.neg_i16x16(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x32 { self.simd.reinterpret_u8_i16x16(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x8 { self.simd.reinterpret_u32_i16x16(self) } } impl crate::SimdBase for i16x16 { const N: usize = 16; type Mask = mask16x16; type Block = i16x8; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i16] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i16] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { let mut val = [0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i16) -> Self { simd.splat_i16x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdInt for i16x16 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_eq_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_lt_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_le_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_ge_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_gt_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> i16x16 { self.simd.zip_low_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> i16x16 { self.simd.zip_high_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> i16x16 { self.simd.unzip_low_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> i16x16 { self.simd.unzip_high_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> i16x16 { self.simd.min_i16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> i16x16 { self.simd.max_i16x16(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub 
struct u16x16 { pub val: [u16; 16], pub simd: S, } impl SimdFrom<[u16; 16], S> for u16x16 { #[inline(always)] fn simd_from(val: [u16; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [u16; 16] { #[inline(always)] fn from(value: u16x16) -> Self { value.val } } impl core::ops::Deref for u16x16 { type Target = [u16; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for u16x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for u16x16 { #[inline(always)] fn simd_from(value: u16, simd: S) -> Self { simd.splat_u16x16(value) } } impl Select> for mask16x16 { #[inline(always)] fn select(self, if_true: u16x16, if_false: u16x16) -> u16x16 { self.simd.select_u16x16(self, if_true, if_false) } } impl Bytes for u16x16 { type Bytes = u8x32; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl u16x16 { #[inline(always)] pub fn not(self) -> u16x16 { self.simd.not_u16x16(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> u16x16 { self.simd.add_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> u16x16 { self.simd.sub_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> u16x16 { self.simd.mul_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> u16x16 { self.simd.and_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> u16x16 { self.simd.or_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> u16x16 { self.simd.xor_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> u16x16 { self.simd.shr_u16x16(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> u16x16 { self.simd.shrv_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> u16x16 { self.simd.shl_u16x16(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_eq_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_lt_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_le_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_ge_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_gt_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> u16x16 { self.simd.min_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> u16x16 { self.simd.max_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> u16x32 { self.simd.combine_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn 
reinterpret_u8(self) -> u8x32 { self.simd.reinterpret_u8_u16x16(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x8 { self.simd.reinterpret_u32_u16x16(self) } } impl crate::SimdBase for u16x16 { const N: usize = 16; type Mask = mask16x16; type Block = u16x8; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[u16] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [u16] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[u16]) -> Self { let mut val = [0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: u16) -> Self { simd.splat_u16x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdInt for u16x16 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_eq_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_lt_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_le_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_ge_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_gt_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> u16x16 { self.simd.zip_low_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> u16x16 { self.simd.zip_high_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> u16x16 { self.simd.unzip_low_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> u16x16 { self.simd.unzip_high_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> u16x16 { self.simd.min_u16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> u16x16 { self.simd.max_u16x16(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct mask16x16 { pub val: [i16; 16], pub simd: S, } impl SimdFrom<[i16; 16], S> for mask16x16 { #[inline(always)] fn simd_from(val: [i16; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [i16; 16] { #[inline(always)] fn from(value: mask16x16) -> Self { value.val } } impl core::ops::Deref for mask16x16 { type Target = [i16; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask16x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask16x16 { #[inline(always)] fn simd_from(value: i16, simd: S) -> Self { simd.splat_mask16x16(value) } } impl Select> for mask16x16 { #[inline(always)] fn select(self, if_true: mask16x16, if_false: mask16x16) -> mask16x16 { self.simd.select_mask16x16(self, if_true, if_false) } } impl Bytes for mask16x16 { type Bytes = u8x32; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { 
unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl mask16x16 { #[inline(always)] pub fn not(self) -> mask16x16 { self.simd.not_mask16x16(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> mask16x16 { self.simd.and_mask16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> mask16x16 { self.simd.or_mask16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> mask16x16 { self.simd.xor_mask16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_eq_mask16x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> mask16x32 { self.simd.combine_mask16x16(self, rhs.simd_into(self.simd)) } } impl crate::SimdBase for mask16x16 { const N: usize = 16; type Mask = mask16x16; type Block = mask16x8; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i16] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i16] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { let mut val = [0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i16) -> Self { simd.splat_mask16x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdMask for mask16x16 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask16x16 { self.simd.simd_eq_mask16x16(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct i32x8 { pub val: [i32; 8], pub simd: S, } impl SimdFrom<[i32; 8], S> for i32x8 { #[inline(always)] fn simd_from(val: [i32; 8], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], ], simd, } } } impl From> for [i32; 8] { #[inline(always)] fn from(value: i32x8) -> Self { value.val } } impl core::ops::Deref for i32x8 { type Target = [i32; 8]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for i32x8 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for i32x8 { #[inline(always)] fn simd_from(value: i32, simd: S) -> Self { simd.splat_i32x8(value) } } impl Select> for mask32x8 { #[inline(always)] fn select(self, if_true: i32x8, if_false: i32x8) -> i32x8 { self.simd.select_i32x8(self, if_true, if_false) } } impl Bytes for i32x8 { type Bytes = u8x32; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl i32x8 { #[inline(always)] pub fn not(self) -> i32x8 { self.simd.not_i32x8(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> i32x8 { self.simd.add_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> i32x8 { self.simd.sub_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> i32x8 { self.simd.mul_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> i32x8 { self.simd.and_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> i32x8 { self.simd.or_i32x8(self, 
rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> i32x8 { self.simd.xor_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> i32x8 { self.simd.shr_i32x8(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> i32x8 { self.simd.shrv_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> i32x8 { self.simd.shl_i32x8(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_eq_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_lt_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_le_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_ge_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_gt_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> i32x8 { self.simd.min_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> i32x8 { self.simd.max_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> i32x16 { self.simd.combine_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn neg(self) -> i32x8 { self.simd.neg_i32x8(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x32 { self.simd.reinterpret_u8_i32x8(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x8 { self.simd.reinterpret_u32_i32x8(self) } #[inline(always)] pub fn cvt_f32(self) -> f32x8 { self.simd.cvt_f32_i32x8(self) } } impl crate::SimdBase for i32x8 { const N: usize = 8; type Mask = mask32x8; type Block = i32x4; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { let mut val = [0; 8]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i32) -> Self { simd.splat_i32x8(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdInt for i32x8 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_eq_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_lt_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_le_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_ge_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_gt_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> i32x8 { self.simd.zip_low_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> i32x8 { self.simd.zip_high_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> i32x8 { self.simd.unzip_low_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> i32x8 { 
self.simd.unzip_high_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> i32x8 { self.simd.min_i32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> i32x8 { self.simd.max_i32x8(self, rhs.simd_into(self.simd)) } } impl SimdCvtTruncate> for i32x8 { fn truncate_from(x: f32x8) -> Self { x.simd.cvt_i32_f32x8(x) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct u32x8 { pub val: [u32; 8], pub simd: S, } impl SimdFrom<[u32; 8], S> for u32x8 { #[inline(always)] fn simd_from(val: [u32; 8], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], ], simd, } } } impl From> for [u32; 8] { #[inline(always)] fn from(value: u32x8) -> Self { value.val } } impl core::ops::Deref for u32x8 { type Target = [u32; 8]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for u32x8 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for u32x8 { #[inline(always)] fn simd_from(value: u32, simd: S) -> Self { simd.splat_u32x8(value) } } impl Select> for mask32x8 { #[inline(always)] fn select(self, if_true: u32x8, if_false: u32x8) -> u32x8 { self.simd.select_u32x8(self, if_true, if_false) } } impl Bytes for u32x8 { type Bytes = u8x32; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl u32x8 { #[inline(always)] pub fn not(self) -> u32x8 { self.simd.not_u32x8(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> u32x8 { self.simd.add_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> u32x8 { self.simd.sub_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> u32x8 { self.simd.mul_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> u32x8 { self.simd.and_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> u32x8 { self.simd.or_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> u32x8 { self.simd.xor_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> u32x8 { self.simd.shr_u32x8(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> u32x8 { self.simd.shrv_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> u32x8 { self.simd.shl_u32x8(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_eq_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_lt_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_le_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_ge_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_gt_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> u32x8 { self.simd.min_u32x8(self, rhs.simd_into(self.simd)) } 
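// Illustrative sketch, not part of the generated file: every method above is
// a thin wrapper that forwards to the `Simd` witness stored in `self.simd`,
// so helpers can stay generic over the backend and use only these wrappers.
// For example, an overflow-safe floor average of two `u32x8` vectors falls
// out of the bitwise ops and `shr` alone:
//
//     fn avg_down<S: Simd>(a: u32x8<S>, b: u32x8<S>) -> u32x8<S> {
//         // (a & b) + ((a ^ b) >> 1) equals (a + b) / 2, rounded down,
//         // without the intermediate overflow of a plain a.add(b)
//         a.and(b).add(a.xor(b).shr(1))
//     }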
#[inline(always)] pub fn max(self, rhs: impl SimdInto) -> u32x8 { self.simd.max_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> u32x16 { self.simd.combine_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x32 { self.simd.reinterpret_u8_u32x8(self) } #[inline(always)] pub fn cvt_f32(self) -> f32x8 { self.simd.cvt_f32_u32x8(self) } } impl crate::SimdBase for u32x8 { const N: usize = 8; type Mask = mask32x8; type Block = u32x4; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[u32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [u32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[u32]) -> Self { let mut val = [0; 8]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: u32) -> Self { simd.splat_u32x8(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdInt for u32x8 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_eq_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_lt_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_le_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_ge_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask32x8 { self.simd.simd_gt_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> u32x8 { self.simd.zip_low_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> u32x8 { self.simd.zip_high_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> u32x8 { self.simd.unzip_low_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> u32x8 { self.simd.unzip_high_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> u32x8 { self.simd.min_u32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> u32x8 { self.simd.max_u32x8(self, rhs.simd_into(self.simd)) } } impl SimdCvtTruncate> for u32x8 { fn truncate_from(x: f32x8) -> Self { x.simd.cvt_u32_f32x8(x) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct mask32x8 { pub val: [i32; 8], pub simd: S, } impl SimdFrom<[i32; 8], S> for mask32x8 { #[inline(always)] fn simd_from(val: [i32; 8], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], ], simd, } } } impl From> for [i32; 8] { #[inline(always)] fn from(value: mask32x8) -> Self { value.val } } impl core::ops::Deref for mask32x8 { type Target = [i32; 8]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask32x8 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask32x8 { #[inline(always)] fn simd_from(value: i32, simd: S) -> Self { simd.splat_mask32x8(value) } } impl Select> for mask32x8 { #[inline(always)] fn select(self, if_true: mask32x8, if_false: mask32x8) -> mask32x8 { self.simd.select_mask32x8(self, if_true, if_false) } } impl Bytes for mask32x8 { type Bytes = 
u8x32<S>; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } }
impl<S: Simd> mask32x8<S> { #[inline(always)] pub fn not(self) -> mask32x8<S> { self.simd.not_mask32x8(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> { self.simd.and_mask32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> { self.simd.or_mask32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> { self.simd.xor_mask32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> { self.simd.simd_eq_mask32x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto<Self, S>) -> mask32x16<S> { self.simd.combine_mask32x8(self, rhs.simd_into(self.simd)) } }
impl<S: Simd> crate::SimdBase<i32, S> for mask32x8<S> { const N: usize = 8; type Mask = mask32x8<S>; type Block = mask32x4<S>; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { let mut val = [0; 8]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i32) -> Self { simd.splat_mask32x8(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } }
impl<S: Simd> crate::SimdMask<i32, S> for mask32x8<S> { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> mask32x8<S> { self.simd.simd_eq_mask32x8(self, rhs.simd_into(self.simd)) } }
#[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct f64x4<S: Simd> { pub val: [f64; 4], pub simd: S, }
impl<S: Simd> SimdFrom<[f64; 4], S> for f64x4<S> { #[inline(always)] fn simd_from(val: [f64; 4], simd: S) -> Self { Self { val: [val[0usize], val[1usize], val[2usize], val[3usize]], simd, } } }
impl<S: Simd> From<f64x4<S>> for [f64; 4] { #[inline(always)] fn from(value: f64x4<S>) -> Self { value.val } }
impl<S: Simd> core::ops::Deref for f64x4<S> { type Target = [f64; 4]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } }
impl<S: Simd> core::ops::DerefMut for f64x4<S> { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } }
impl<S: Simd> SimdFrom<f64, S> for f64x4<S> { #[inline(always)] fn simd_from(value: f64, simd: S) -> Self { simd.splat_f64x4(value) } }
impl<S: Simd> Select<f64x4<S>> for mask64x4<S> { #[inline(always)] fn select(self, if_true: f64x4<S>, if_false: f64x4<S>) -> f64x4<S> { self.simd.select_f64x4(self, if_true, if_false) } }
impl<S: Simd> Bytes for f64x4<S> { type Bytes = u8x32<S>; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } }
impl<S: Simd> f64x4<S> { #[inline(always)] pub fn abs(self) -> f64x4<S> { self.simd.abs_f64x4(self) } #[inline(always)] pub fn neg(self) -> f64x4<S> { self.simd.neg_f64x4(self) } #[inline(always)] pub fn sqrt(self) -> f64x4<S> { self.simd.sqrt_f64x4(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto<Self, S>) -> f64x4<S> { self.simd.add_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto<Self, S>) -> f64x4<S> { self.simd.sub_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto<Self, S>) -> f64x4<S> {
self.simd.mul_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn div(self, rhs: impl SimdInto) -> f64x4 { self.simd.div_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn copysign(self, rhs: impl SimdInto) -> f64x4 { self.simd.copysign_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_eq_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_lt_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_le_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_ge_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_gt_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> f64x4 { self.simd.max_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max_precise(self, rhs: impl SimdInto) -> f64x4 { self.simd.max_precise_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> f64x4 { self.simd.min_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min_precise(self, rhs: impl SimdInto) -> f64x4 { self.simd.min_precise_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn floor(self) -> f64x4 { self.simd.floor_f64x4(self) } #[inline(always)] pub fn fract(self) -> f64x4 { self.simd.fract_f64x4(self) } #[inline(always)] pub fn trunc(self) -> f64x4 { self.simd.trunc_f64x4(self) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> f64x8 { self.simd.combine_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_f32(self) -> f32x8 { self.simd.reinterpret_f32_f64x4(self) } } impl crate::SimdBase for f64x4 { const N: usize = 4; type Mask = mask64x4; type Block = f64x2; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[f64] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [f64] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[f64]) -> Self { let mut val = [0.0; 4]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: f64) -> Self { simd.splat_f64x4(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdFloat for f64x4 { #[inline(always)] fn abs(self) -> f64x4 { self.simd.abs_f64x4(self) } #[inline(always)] fn sqrt(self) -> f64x4 { self.simd.sqrt_f64x4(self) } #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> f64x4 { self.simd.copysign_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_eq_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_lt_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_le_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_ge_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_gt_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> f64x4 { 
self.simd.zip_low_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> f64x4 { self.simd.zip_high_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> f64x4 { self.simd.unzip_low_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> f64x4 { self.simd.unzip_high_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> f64x4 { self.simd.max_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max_precise(self, rhs: impl SimdInto) -> f64x4 { self.simd.max_precise_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> f64x4 { self.simd.min_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min_precise(self, rhs: impl SimdInto) -> f64x4 { self.simd.min_precise_f64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] fn madd(self, op1: impl SimdInto, op2: impl SimdInto) -> f64x4 { self.simd .madd_f64x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] fn msub(self, op1: impl SimdInto, op2: impl SimdInto) -> f64x4 { self.simd .msub_f64x4(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] fn floor(self) -> f64x4 { self.simd.floor_f64x4(self) } #[inline(always)] fn fract(self) -> f64x4 { self.simd.fract_f64x4(self) } #[inline(always)] fn trunc(self) -> f64x4 { self.simd.trunc_f64x4(self) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(32))] pub struct mask64x4 { pub val: [i64; 4], pub simd: S, } impl SimdFrom<[i64; 4], S> for mask64x4 { #[inline(always)] fn simd_from(val: [i64; 4], simd: S) -> Self { Self { val: [val[0usize], val[1usize], val[2usize], val[3usize]], simd, } } } impl From> for [i64; 4] { #[inline(always)] fn from(value: mask64x4) -> Self { value.val } } impl core::ops::Deref for mask64x4 { type Target = [i64; 4]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask64x4 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask64x4 { #[inline(always)] fn simd_from(value: i64, simd: S) -> Self { simd.splat_mask64x4(value) } } impl Select> for mask64x4 { #[inline(always)] fn select(self, if_true: mask64x4, if_false: mask64x4) -> mask64x4 { self.simd.select_mask64x4(self, if_true, if_false) } } impl Bytes for mask64x4 { type Bytes = u8x32; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x32 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl mask64x4 { #[inline(always)] pub fn not(self) -> mask64x4 { self.simd.not_mask64x4(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> mask64x4 { self.simd.and_mask64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> mask64x4 { self.simd.or_mask64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> mask64x4 { self.simd.xor_mask64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_eq_mask64x4(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn combine(self, rhs: impl SimdInto) -> mask64x8 { self.simd.combine_mask64x4(self, rhs.simd_into(self.simd)) } } impl crate::SimdBase for mask64x4 { const N: usize = 4; type Mask = 
mask64x4; type Block = mask64x2; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i64] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i64] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i64]) -> Self { let mut val = [0; 4]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i64) -> Self { simd.splat_mask64x4(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { block.combine(block) } } impl crate::SimdMask for mask64x4 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask64x4 { self.simd.simd_eq_mask64x4(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct f32x16 { pub val: [f32; 16], pub simd: S, } impl SimdFrom<[f32; 16], S> for f32x16 { #[inline(always)] fn simd_from(val: [f32; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [f32; 16] { #[inline(always)] fn from(value: f32x16) -> Self { value.val } } impl core::ops::Deref for f32x16 { type Target = [f32; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for f32x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for f32x16 { #[inline(always)] fn simd_from(value: f32, simd: S) -> Self { simd.splat_f32x16(value) } } impl Select> for mask32x16 { #[inline(always)] fn select(self, if_true: f32x16, if_false: f32x16) -> f32x16 { self.simd.select_f32x16(self, if_true, if_false) } } impl Bytes for f32x16 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl f32x16 { #[inline(always)] pub fn abs(self) -> f32x16 { self.simd.abs_f32x16(self) } #[inline(always)] pub fn neg(self) -> f32x16 { self.simd.neg_f32x16(self) } #[inline(always)] pub fn sqrt(self) -> f32x16 { self.simd.sqrt_f32x16(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> f32x16 { self.simd.add_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> f32x16 { self.simd.sub_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> f32x16 { self.simd.mul_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn div(self, rhs: impl SimdInto) -> f32x16 { self.simd.div_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn copysign(self, rhs: impl SimdInto) -> f32x16 { self.simd.copysign_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_eq_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_lt_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_le_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_ge_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> 
mask32x16 { self.simd.simd_gt_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> f32x16 { self.simd.max_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max_precise(self, rhs: impl SimdInto) -> f32x16 { self.simd.max_precise_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> f32x16 { self.simd.min_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min_precise(self, rhs: impl SimdInto) -> f32x16 { self.simd.min_precise_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn floor(self) -> f32x16 { self.simd.floor_f32x16(self) } #[inline(always)] pub fn fract(self) -> f32x16 { self.simd.fract_f32x16(self) } #[inline(always)] pub fn trunc(self) -> f32x16 { self.simd.trunc_f32x16(self) } #[inline(always)] pub fn reinterpret_f64(self) -> f64x8 { self.simd.reinterpret_f64_f32x16(self) } #[inline(always)] pub fn reinterpret_i32(self) -> i32x16 { self.simd.reinterpret_i32_f32x16(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x64 { self.simd.reinterpret_u8_f32x16(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x16 { self.simd.reinterpret_u32_f32x16(self) } #[inline(always)] pub fn cvt_u32(self) -> u32x16 { self.simd.cvt_u32_f32x16(self) } #[inline(always)] pub fn cvt_i32(self) -> i32x16 { self.simd.cvt_i32_f32x16(self) } } impl crate::SimdBase for f32x16 { const N: usize = 16; type Mask = mask32x16; type Block = f32x4; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[f32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [f32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[f32]) -> Self { let mut val = [0.0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: f32) -> Self { simd.splat_f32x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdFloat for f32x16 { #[inline(always)] fn abs(self) -> f32x16 { self.simd.abs_f32x16(self) } #[inline(always)] fn sqrt(self) -> f32x16 { self.simd.sqrt_f32x16(self) } #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> f32x16 { self.simd.copysign_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_eq_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_lt_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_le_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_ge_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_gt_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> f32x16 { self.simd.zip_low_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> f32x16 { self.simd.zip_high_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> f32x16 { self.simd.unzip_low_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> f32x16 { self.simd.unzip_high_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) 
-> f32x16 { self.simd.max_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max_precise(self, rhs: impl SimdInto) -> f32x16 { self.simd.max_precise_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> f32x16 { self.simd.min_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min_precise(self, rhs: impl SimdInto) -> f32x16 { self.simd.min_precise_f32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn madd(self, op1: impl SimdInto, op2: impl SimdInto) -> f32x16 { self.simd .madd_f32x16(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] fn msub(self, op1: impl SimdInto, op2: impl SimdInto) -> f32x16 { self.simd .msub_f32x16(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] fn floor(self) -> f32x16 { self.simd.floor_f32x16(self) } #[inline(always)] fn fract(self) -> f32x16 { self.simd.fract_f32x16(self) } #[inline(always)] fn trunc(self) -> f32x16 { self.simd.trunc_f32x16(self) } } impl SimdCvtFloat> for f32x16 { fn float_from(x: u32x16) -> Self { x.simd.cvt_f32_u32x16(x) } } impl SimdCvtFloat> for f32x16 { fn float_from(x: i32x16) -> Self { x.simd.cvt_f32_i32x16(x) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct i8x64 { pub val: [i8; 64], pub simd: S, } impl SimdFrom<[i8; 64], S> for i8x64 { #[inline(always)] fn simd_from(val: [i8; 64], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], val[16usize], val[17usize], val[18usize], val[19usize], val[20usize], val[21usize], val[22usize], val[23usize], val[24usize], val[25usize], val[26usize], val[27usize], val[28usize], val[29usize], val[30usize], val[31usize], val[32usize], val[33usize], val[34usize], val[35usize], val[36usize], val[37usize], val[38usize], val[39usize], val[40usize], val[41usize], val[42usize], val[43usize], val[44usize], val[45usize], val[46usize], val[47usize], val[48usize], val[49usize], val[50usize], val[51usize], val[52usize], val[53usize], val[54usize], val[55usize], val[56usize], val[57usize], val[58usize], val[59usize], val[60usize], val[61usize], val[62usize], val[63usize], ], simd, } } } impl From> for [i8; 64] { #[inline(always)] fn from(value: i8x64) -> Self { value.val } } impl core::ops::Deref for i8x64 { type Target = [i8; 64]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for i8x64 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for i8x64 { #[inline(always)] fn simd_from(value: i8, simd: S) -> Self { simd.splat_i8x64(value) } } impl Select> for mask8x64 { #[inline(always)] fn select(self, if_true: i8x64, if_false: i8x64) -> i8x64 { self.simd.select_i8x64(self, if_true, if_false) } } impl Bytes for i8x64 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl i8x64 { #[inline(always)] pub fn not(self) -> i8x64 { self.simd.not_i8x64(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> i8x64 { self.simd.add_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> i8x64 { self.simd.sub_i8x64(self, 
rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> i8x64 { self.simd.mul_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> i8x64 { self.simd.and_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> i8x64 { self.simd.or_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> i8x64 { self.simd.xor_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> i8x64 { self.simd.shr_i8x64(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> i8x64 { self.simd.shrv_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> i8x64 { self.simd.shl_i8x64(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_eq_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_lt_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_le_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_ge_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_gt_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> i8x64 { self.simd.min_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> i8x64 { self.simd.max_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn neg(self) -> i8x64 { self.simd.neg_i8x64(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x64 { self.simd.reinterpret_u8_i8x64(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x16 { self.simd.reinterpret_u32_i8x64(self) } } impl crate::SimdBase for i8x64 { const N: usize = 64; type Mask = mask8x64; type Block = i8x16; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i8] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i8] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { let mut val = [0; 64]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i8) -> Self { simd.splat_i8x64(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdInt for i8x64 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_eq_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_lt_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_le_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_ge_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_gt_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> i8x64 { self.simd.zip_low_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> i8x64 { self.simd.zip_high_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn 
unzip_low(self, rhs: impl SimdInto<Self, S>) -> i8x64<S> { self.simd.unzip_low_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto<Self, S>) -> i8x64<S> { self.simd.unzip_high_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto<Self, S>) -> i8x64<S> { self.simd.min_i8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto<Self, S>) -> i8x64<S> { self.simd.max_i8x64(self, rhs.simd_into(self.simd)) } }
#[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct u8x64<S: Simd> { pub val: [u8; 64], pub simd: S, }
impl<S: Simd> SimdFrom<[u8; 64], S> for u8x64<S> { #[inline(always)] fn simd_from(val: [u8; 64], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], val[16usize], val[17usize], val[18usize], val[19usize], val[20usize], val[21usize], val[22usize], val[23usize], val[24usize], val[25usize], val[26usize], val[27usize], val[28usize], val[29usize], val[30usize], val[31usize], val[32usize], val[33usize], val[34usize], val[35usize], val[36usize], val[37usize], val[38usize], val[39usize], val[40usize], val[41usize], val[42usize], val[43usize], val[44usize], val[45usize], val[46usize], val[47usize], val[48usize], val[49usize], val[50usize], val[51usize], val[52usize], val[53usize], val[54usize], val[55usize], val[56usize], val[57usize], val[58usize], val[59usize], val[60usize], val[61usize], val[62usize], val[63usize], ], simd, } } }
impl<S: Simd> From<u8x64<S>> for [u8; 64] { #[inline(always)] fn from(value: u8x64<S>) -> Self { value.val } }
impl<S: Simd> core::ops::Deref for u8x64<S> { type Target = [u8; 64]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } }
impl<S: Simd> core::ops::DerefMut for u8x64<S> { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } }
impl<S: Simd> SimdFrom<u8, S> for u8x64<S> { #[inline(always)] fn simd_from(value: u8, simd: S) -> Self { simd.splat_u8x64(value) } }
impl<S: Simd> Select<u8x64<S>> for mask8x64<S> { #[inline(always)] fn select(self, if_true: u8x64<S>, if_false: u8x64<S>) -> u8x64<S> { self.simd.select_u8x64(self, if_true, if_false) } }
impl<S: Simd> Bytes for u8x64<S> { type Bytes = u8x64<S>; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } }
impl<S: Simd> u8x64<S> { #[inline(always)] pub fn not(self) -> u8x64<S> { self.simd.not_u8x64(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto<Self, S>) -> u8x64<S> { self.simd.add_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto<Self, S>) -> u8x64<S> { self.simd.sub_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto<Self, S>) -> u8x64<S> { self.simd.mul_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto<Self, S>) -> u8x64<S> { self.simd.and_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto<Self, S>) -> u8x64<S> { self.simd.or_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto<Self, S>) -> u8x64<S> { self.simd.xor_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> u8x64<S> { self.simd.shr_u8x64(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto<Self, S>) -> u8x64<S> { self.simd.shrv_u8x64(self, rhs.simd_into(self.simd)) }
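// Illustrative sketch, not part of the generated file: comparisons on
// `u8x64` yield a `mask8x64`, which can drive the `Select` impl above to
// blend lanes, e.g. clamping each lane to a per-lane ceiling:
//
//     fn clamp_hi<S: Simd>(x: u8x64<S>, hi: u8x64<S>) -> u8x64<S> {
//         // where x > hi take hi, otherwise keep x; equivalent to x.min(hi)
//         x.simd_gt(hi).select(hi, x)
//     }
//
// `min` is the cheaper spelling when a single instruction exists for it; the
// compare-and-select form generalizes to conditions min/max cannot express.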
#[inline(always)] pub fn shl(self, shift: u32) -> u8x64 { self.simd.shl_u8x64(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_eq_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_lt_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_le_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_ge_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_gt_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> u8x64 { self.simd.min_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> u8x64 { self.simd.max_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x16 { self.simd.reinterpret_u32_u8x64(self) } } impl crate::SimdBase for u8x64 { const N: usize = 64; type Mask = mask8x64; type Block = u8x16; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[u8] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [u8] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[u8]) -> Self { let mut val = [0; 64]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: u8) -> Self { simd.splat_u8x64(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdInt for u8x64 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_eq_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_lt_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_le_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_ge_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_gt_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> u8x64 { self.simd.zip_low_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> u8x64 { self.simd.zip_high_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> u8x64 { self.simd.unzip_low_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> u8x64 { self.simd.unzip_high_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> u8x64 { self.simd.min_u8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> u8x64 { self.simd.max_u8x64(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct mask8x64 { pub val: [i8; 64], pub simd: S, } impl SimdFrom<[i8; 64], S> for mask8x64 { #[inline(always)] fn simd_from(val: [i8; 64], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], 
val[15usize], val[16usize], val[17usize], val[18usize], val[19usize], val[20usize], val[21usize], val[22usize], val[23usize], val[24usize], val[25usize], val[26usize], val[27usize], val[28usize], val[29usize], val[30usize], val[31usize], val[32usize], val[33usize], val[34usize], val[35usize], val[36usize], val[37usize], val[38usize], val[39usize], val[40usize], val[41usize], val[42usize], val[43usize], val[44usize], val[45usize], val[46usize], val[47usize], val[48usize], val[49usize], val[50usize], val[51usize], val[52usize], val[53usize], val[54usize], val[55usize], val[56usize], val[57usize], val[58usize], val[59usize], val[60usize], val[61usize], val[62usize], val[63usize], ], simd, } } } impl From> for [i8; 64] { #[inline(always)] fn from(value: mask8x64) -> Self { value.val } } impl core::ops::Deref for mask8x64 { type Target = [i8; 64]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask8x64 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask8x64 { #[inline(always)] fn simd_from(value: i8, simd: S) -> Self { simd.splat_mask8x64(value) } } impl Select> for mask8x64 { #[inline(always)] fn select(self, if_true: mask8x64, if_false: mask8x64) -> mask8x64 { self.simd.select_mask8x64(self, if_true, if_false) } } impl Bytes for mask8x64 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl mask8x64 { #[inline(always)] pub fn not(self) -> mask8x64 { self.simd.not_mask8x64(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> mask8x64 { self.simd.and_mask8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> mask8x64 { self.simd.or_mask8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> mask8x64 { self.simd.xor_mask8x64(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_eq_mask8x64(self, rhs.simd_into(self.simd)) } } impl crate::SimdBase for mask8x64 { const N: usize = 64; type Mask = mask8x64; type Block = mask8x16; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i8] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i8] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i8]) -> Self { let mut val = [0; 64]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i8) -> Self { simd.splat_mask8x64(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdMask for mask8x64 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask8x64 { self.simd.simd_eq_mask8x64(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct i16x32 { pub val: [i16; 32], pub simd: S, } impl SimdFrom<[i16; 32], S> for i16x32 { #[inline(always)] fn simd_from(val: [i16; 32], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], val[16usize], val[17usize], val[18usize], 
val[19usize], val[20usize], val[21usize], val[22usize], val[23usize], val[24usize], val[25usize], val[26usize], val[27usize], val[28usize], val[29usize], val[30usize], val[31usize], ], simd, } } } impl From> for [i16; 32] { #[inline(always)] fn from(value: i16x32) -> Self { value.val } } impl core::ops::Deref for i16x32 { type Target = [i16; 32]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for i16x32 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for i16x32 { #[inline(always)] fn simd_from(value: i16, simd: S) -> Self { simd.splat_i16x32(value) } } impl Select> for mask16x32 { #[inline(always)] fn select(self, if_true: i16x32, if_false: i16x32) -> i16x32 { self.simd.select_i16x32(self, if_true, if_false) } } impl Bytes for i16x32 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl i16x32 { #[inline(always)] pub fn not(self) -> i16x32 { self.simd.not_i16x32(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> i16x32 { self.simd.add_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> i16x32 { self.simd.sub_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> i16x32 { self.simd.mul_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> i16x32 { self.simd.and_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> i16x32 { self.simd.or_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> i16x32 { self.simd.xor_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> i16x32 { self.simd.shr_i16x32(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> i16x32 { self.simd.shrv_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> i16x32 { self.simd.shl_i16x32(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_eq_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_lt_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_le_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_ge_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_gt_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> i16x32 { self.simd.min_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> i16x32 { self.simd.max_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn neg(self) -> i16x32 { self.simd.neg_i16x32(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x64 { self.simd.reinterpret_u8_i16x32(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x16 { self.simd.reinterpret_u32_i16x32(self) } } impl crate::SimdBase for i16x32 { const N: usize = 32; type Mask = mask16x32; type Block = i16x8; 
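// Exposition note (added for this edit; not generator output): `Block` is the
// 128-bit native vector type that this wider logical vector is built from.
// `block_splat` below broadcasts one such block across all lanes by doubling
// it twice with `combine`: an `i16x8` block becomes `i16x16`, then `i16x32`.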
#[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i16] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i16] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { let mut val = [0; 32]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i16) -> Self { simd.splat_i16x32(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdInt for i16x32 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_eq_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_lt_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_le_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_ge_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_gt_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> i16x32 { self.simd.zip_low_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> i16x32 { self.simd.zip_high_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> i16x32 { self.simd.unzip_low_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> i16x32 { self.simd.unzip_high_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> i16x32 { self.simd.min_i16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> i16x32 { self.simd.max_i16x32(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct u16x32 { pub val: [u16; 32], pub simd: S, } impl SimdFrom<[u16; 32], S> for u16x32 { #[inline(always)] fn simd_from(val: [u16; 32], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], val[16usize], val[17usize], val[18usize], val[19usize], val[20usize], val[21usize], val[22usize], val[23usize], val[24usize], val[25usize], val[26usize], val[27usize], val[28usize], val[29usize], val[30usize], val[31usize], ], simd, } } } impl From> for [u16; 32] { #[inline(always)] fn from(value: u16x32) -> Self { value.val } } impl core::ops::Deref for u16x32 { type Target = [u16; 32]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for u16x32 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for u16x32 { #[inline(always)] fn simd_from(value: u16, simd: S) -> Self { simd.splat_u16x32(value) } } impl Select> for mask16x32 { #[inline(always)] fn select(self, if_true: u16x32, if_false: u16x32) -> u16x32 { self.simd.select_u16x32(self, if_true, if_false) } } impl Bytes for u16x32 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: 
core::mem::transmute(value.val), simd: value.simd, } } } } impl u16x32 { #[inline(always)] pub fn not(self) -> u16x32 { self.simd.not_u16x32(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> u16x32 { self.simd.add_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> u16x32 { self.simd.sub_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> u16x32 { self.simd.mul_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> u16x32 { self.simd.and_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> u16x32 { self.simd.or_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> u16x32 { self.simd.xor_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> u16x32 { self.simd.shr_u16x32(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> u16x32 { self.simd.shrv_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> u16x32 { self.simd.shl_u16x32(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_eq_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_lt_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_le_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_ge_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_gt_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> u16x32 { self.simd.min_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> u16x32 { self.simd.max_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x64 { self.simd.reinterpret_u8_u16x32(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x16 { self.simd.reinterpret_u32_u16x32(self) } } impl crate::SimdBase for u16x32 { const N: usize = 32; type Mask = mask16x32; type Block = u16x8; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[u16] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [u16] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[u16]) -> Self { let mut val = [0; 32]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: u16) -> Self { simd.splat_u16x32(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdInt for u16x32 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_eq_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_lt_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_le_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_ge_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask16x32 { 
self.simd.simd_gt_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> u16x32 { self.simd.zip_low_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> u16x32 { self.simd.zip_high_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> u16x32 { self.simd.unzip_low_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> u16x32 { self.simd.unzip_high_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> u16x32 { self.simd.min_u16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> u16x32 { self.simd.max_u16x32(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct mask16x32 { pub val: [i16; 32], pub simd: S, } impl SimdFrom<[i16; 32], S> for mask16x32 { #[inline(always)] fn simd_from(val: [i16; 32], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], val[16usize], val[17usize], val[18usize], val[19usize], val[20usize], val[21usize], val[22usize], val[23usize], val[24usize], val[25usize], val[26usize], val[27usize], val[28usize], val[29usize], val[30usize], val[31usize], ], simd, } } } impl From> for [i16; 32] { #[inline(always)] fn from(value: mask16x32) -> Self { value.val } } impl core::ops::Deref for mask16x32 { type Target = [i16; 32]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask16x32 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask16x32 { #[inline(always)] fn simd_from(value: i16, simd: S) -> Self { simd.splat_mask16x32(value) } } impl Select> for mask16x32 { #[inline(always)] fn select(self, if_true: mask16x32, if_false: mask16x32) -> mask16x32 { self.simd.select_mask16x32(self, if_true, if_false) } } impl Bytes for mask16x32 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl mask16x32 { #[inline(always)] pub fn not(self) -> mask16x32 { self.simd.not_mask16x32(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> mask16x32 { self.simd.and_mask16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> mask16x32 { self.simd.or_mask16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> mask16x32 { self.simd.xor_mask16x32(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_eq_mask16x32(self, rhs.simd_into(self.simd)) } } impl crate::SimdBase for mask16x32 { const N: usize = 32; type Mask = mask16x32; type Block = mask16x8; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i16] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i16] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i16]) -> Self { let mut val = [0; 32]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: 
S, val: i16) -> Self { simd.splat_mask16x32(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdMask for mask16x32 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask16x32 { self.simd.simd_eq_mask16x32(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct i32x16 { pub val: [i32; 16], pub simd: S, } impl SimdFrom<[i32; 16], S> for i32x16 { #[inline(always)] fn simd_from(val: [i32; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [i32; 16] { #[inline(always)] fn from(value: i32x16) -> Self { value.val } } impl core::ops::Deref for i32x16 { type Target = [i32; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for i32x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for i32x16 { #[inline(always)] fn simd_from(value: i32, simd: S) -> Self { simd.splat_i32x16(value) } } impl Select> for mask32x16 { #[inline(always)] fn select(self, if_true: i32x16, if_false: i32x16) -> i32x16 { self.simd.select_i32x16(self, if_true, if_false) } } impl Bytes for i32x16 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl i32x16 { #[inline(always)] pub fn not(self) -> i32x16 { self.simd.not_i32x16(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> i32x16 { self.simd.add_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> i32x16 { self.simd.sub_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> i32x16 { self.simd.mul_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> i32x16 { self.simd.and_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> i32x16 { self.simd.or_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> i32x16 { self.simd.xor_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> i32x16 { self.simd.shr_i32x16(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> i32x16 { self.simd.shrv_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> i32x16 { self.simd.shl_i32x16(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_eq_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_lt_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_le_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_ge_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_gt_i32x16(self, rhs.simd_into(self.simd)) } 
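// Exposition note (added for this edit; not generator output): each of the
// comparison methods above returns a `mask32x16`, with one mask lane per
// `i32` lane of the inputs. As produced by the underlying comparison
// intrinsics, a lane is all ones where the predicate holds and all zeros
// otherwise, which is exactly the form consumed by the
// `Select<i32x16<S>> for mask32x16<S>` impl's `select(if_true, if_false)`.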
#[inline(always)] pub fn min(self, rhs: impl SimdInto) -> i32x16 { self.simd.min_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> i32x16 { self.simd.max_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn neg(self) -> i32x16 { self.simd.neg_i32x16(self) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x64 { self.simd.reinterpret_u8_i32x16(self) } #[inline(always)] pub fn reinterpret_u32(self) -> u32x16 { self.simd.reinterpret_u32_i32x16(self) } #[inline(always)] pub fn cvt_f32(self) -> f32x16 { self.simd.cvt_f32_i32x16(self) } } impl crate::SimdBase for i32x16 { const N: usize = 16; type Mask = mask32x16; type Block = i32x4; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { let mut val = [0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i32) -> Self { simd.splat_i32x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdInt for i32x16 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_eq_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_lt_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_le_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_ge_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_gt_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> i32x16 { self.simd.zip_low_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> i32x16 { self.simd.zip_high_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> i32x16 { self.simd.unzip_low_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> i32x16 { self.simd.unzip_high_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> i32x16 { self.simd.min_i32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> i32x16 { self.simd.max_i32x16(self, rhs.simd_into(self.simd)) } } impl SimdCvtTruncate> for i32x16 { fn truncate_from(x: f32x16) -> Self { x.simd.cvt_i32_f32x16(x) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct u32x16 { pub val: [u32; 16], pub simd: S, } impl SimdFrom<[u32; 16], S> for u32x16 { #[inline(always)] fn simd_from(val: [u32; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [u32; 16] { #[inline(always)] fn from(value: u32x16) -> Self { value.val } } impl core::ops::Deref for u32x16 { type Target = [u32; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for u32x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { 
&mut self.val } } impl SimdFrom for u32x16 { #[inline(always)] fn simd_from(value: u32, simd: S) -> Self { simd.splat_u32x16(value) } } impl Select> for mask32x16 { #[inline(always)] fn select(self, if_true: u32x16, if_false: u32x16) -> u32x16 { self.simd.select_u32x16(self, if_true, if_false) } } impl Bytes for u32x16 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl u32x16 { #[inline(always)] pub fn not(self) -> u32x16 { self.simd.not_u32x16(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> u32x16 { self.simd.add_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> u32x16 { self.simd.sub_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> u32x16 { self.simd.mul_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> u32x16 { self.simd.and_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> u32x16 { self.simd.or_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> u32x16 { self.simd.xor_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shr(self, shift: u32) -> u32x16 { self.simd.shr_u32x16(self, shift) } #[inline(always)] pub fn shrv(self, rhs: impl SimdInto) -> u32x16 { self.simd.shrv_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn shl(self, shift: u32) -> u32x16 { self.simd.shl_u32x16(self, shift) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_eq_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_lt_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_le_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_ge_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_gt_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> u32x16 { self.simd.min_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max(self, rhs: impl SimdInto) -> u32x16 { self.simd.max_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn reinterpret_u8(self) -> u8x64 { self.simd.reinterpret_u8_u32x16(self) } #[inline(always)] pub fn cvt_f32(self) -> f32x16 { self.simd.cvt_f32_u32x16(self) } } impl crate::SimdBase for u32x16 { const N: usize = 16; type Mask = mask32x16; type Block = u32x4; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[u32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [u32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[u32]) -> Self { let mut val = [0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: u32) -> Self { simd.splat_u32x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdInt for u32x16 { #[inline(always)] fn simd_eq(self, rhs: impl 
SimdInto) -> mask32x16 { self.simd.simd_eq_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_lt_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_le_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_ge_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_gt_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> u32x16 { self.simd.zip_low_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> u32x16 { self.simd.zip_high_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> u32x16 { self.simd.unzip_low_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> u32x16 { self.simd.unzip_high_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> u32x16 { self.simd.min_u32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> u32x16 { self.simd.max_u32x16(self, rhs.simd_into(self.simd)) } } impl SimdCvtTruncate> for u32x16 { fn truncate_from(x: f32x16) -> Self { x.simd.cvt_u32_f32x16(x) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct mask32x16 { pub val: [i32; 16], pub simd: S, } impl SimdFrom<[i32; 16], S> for mask32x16 { #[inline(always)] fn simd_from(val: [i32; 16], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], val[8usize], val[9usize], val[10usize], val[11usize], val[12usize], val[13usize], val[14usize], val[15usize], ], simd, } } } impl From> for [i32; 16] { #[inline(always)] fn from(value: mask32x16) -> Self { value.val } } impl core::ops::Deref for mask32x16 { type Target = [i32; 16]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for mask32x16 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for mask32x16 { #[inline(always)] fn simd_from(value: i32, simd: S) -> Self { simd.splat_mask32x16(value) } } impl Select> for mask32x16 { #[inline(always)] fn select(self, if_true: mask32x16, if_false: mask32x16) -> mask32x16 { self.simd.select_mask32x16(self, if_true, if_false) } } impl Bytes for mask32x16 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl mask32x16 { #[inline(always)] pub fn not(self) -> mask32x16 { self.simd.not_mask32x16(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto) -> mask32x16 { self.simd.and_mask32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto) -> mask32x16 { self.simd.or_mask32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto) -> mask32x16 { self.simd.xor_mask32x16(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_eq_mask32x16(self, rhs.simd_into(self.simd)) } } impl crate::SimdBase for mask32x16 { const N: 
usize = 16; type Mask = mask32x16; type Block = mask32x4; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i32] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i32] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i32]) -> Self { let mut val = [0; 16]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i32) -> Self { simd.splat_mask32x16(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdMask for mask32x16 { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask32x16 { self.simd.simd_eq_mask32x16(self, rhs.simd_into(self.simd)) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct f64x8 { pub val: [f64; 8], pub simd: S, } impl SimdFrom<[f64; 8], S> for f64x8 { #[inline(always)] fn simd_from(val: [f64; 8], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], ], simd, } } } impl From> for [f64; 8] { #[inline(always)] fn from(value: f64x8) -> Self { value.val } } impl core::ops::Deref for f64x8 { type Target = [f64; 8]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl core::ops::DerefMut for f64x8 { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl SimdFrom for f64x8 { #[inline(always)] fn simd_from(value: f64, simd: S) -> Self { simd.splat_f64x8(value) } } impl Select> for mask64x8 { #[inline(always)] fn select(self, if_true: f64x8, if_false: f64x8) -> f64x8 { self.simd.select_f64x8(self, if_true, if_false) } } impl Bytes for f64x8 { type Bytes = u8x64; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl f64x8 { #[inline(always)] pub fn abs(self) -> f64x8 { self.simd.abs_f64x8(self) } #[inline(always)] pub fn neg(self) -> f64x8 { self.simd.neg_f64x8(self) } #[inline(always)] pub fn sqrt(self) -> f64x8 { self.simd.sqrt_f64x8(self) } #[inline(always)] pub fn add(self, rhs: impl SimdInto) -> f64x8 { self.simd.add_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn sub(self, rhs: impl SimdInto) -> f64x8 { self.simd.sub_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn mul(self, rhs: impl SimdInto) -> f64x8 { self.simd.mul_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn div(self, rhs: impl SimdInto) -> f64x8 { self.simd.div_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn copysign(self, rhs: impl SimdInto) -> f64x8 { self.simd.copysign_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_eq_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_lt(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_lt_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_le(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_le_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_ge(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_ge_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_gt(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_gt_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] 
pub fn max(self, rhs: impl SimdInto) -> f64x8 { self.simd.max_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn max_precise(self, rhs: impl SimdInto) -> f64x8 { self.simd.max_precise_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min(self, rhs: impl SimdInto) -> f64x8 { self.simd.min_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn min_precise(self, rhs: impl SimdInto) -> f64x8 { self.simd.min_precise_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn floor(self) -> f64x8 { self.simd.floor_f64x8(self) } #[inline(always)] pub fn fract(self) -> f64x8 { self.simd.fract_f64x8(self) } #[inline(always)] pub fn trunc(self) -> f64x8 { self.simd.trunc_f64x8(self) } #[inline(always)] pub fn reinterpret_f32(self) -> f32x16 { self.simd.reinterpret_f32_f64x8(self) } } impl crate::SimdBase for f64x8 { const N: usize = 8; type Mask = mask64x8; type Block = f64x2; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[f64] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [f64] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[f64]) -> Self { let mut val = [0.0; 8]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: f64) -> Self { simd.splat_f64x8(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl crate::SimdFloat for f64x8 { #[inline(always)] fn abs(self) -> f64x8 { self.simd.abs_f64x8(self) } #[inline(always)] fn sqrt(self) -> f64x8 { self.simd.sqrt_f64x8(self) } #[inline(always)] fn copysign(self, rhs: impl SimdInto) -> f64x8 { self.simd.copysign_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_eq(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_eq_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_lt(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_lt_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_le(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_le_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_ge(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_ge_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn simd_gt(self, rhs: impl SimdInto) -> mask64x8 { self.simd.simd_gt_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_low(self, rhs: impl SimdInto) -> f64x8 { self.simd.zip_low_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn zip_high(self, rhs: impl SimdInto) -> f64x8 { self.simd.zip_high_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_low(self, rhs: impl SimdInto) -> f64x8 { self.simd.unzip_low_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn unzip_high(self, rhs: impl SimdInto) -> f64x8 { self.simd.unzip_high_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max(self, rhs: impl SimdInto) -> f64x8 { self.simd.max_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn max_precise(self, rhs: impl SimdInto) -> f64x8 { self.simd.max_precise_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min(self, rhs: impl SimdInto) -> f64x8 { self.simd.min_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn min_precise(self, rhs: impl SimdInto) -> f64x8 { self.simd.min_precise_f64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] fn madd(self, op1: impl SimdInto, op2: impl SimdInto) -> f64x8 { self.simd .madd_f64x8(self, op1.simd_into(self.simd), 
op2.simd_into(self.simd)) } #[inline(always)] fn msub(self, op1: impl SimdInto<Self, S>, op2: impl SimdInto<Self, S>) -> f64x8<S> { self.simd.msub_f64x8(self, op1.simd_into(self.simd), op2.simd_into(self.simd)) } #[inline(always)] fn floor(self) -> f64x8<S> { self.simd.floor_f64x8(self) } #[inline(always)] fn fract(self) -> f64x8<S> { self.simd.fract_f64x8(self) } #[inline(always)] fn trunc(self) -> f64x8<S> { self.simd.trunc_f64x8(self) } } #[derive(Clone, Copy, Debug)] #[repr(C, align(64))] pub struct mask64x8<S: Simd> { pub val: [i64; 8], pub simd: S, } impl<S: Simd> SimdFrom<[i64; 8], S> for mask64x8<S> { #[inline(always)] fn simd_from(val: [i64; 8], simd: S) -> Self { Self { val: [ val[0usize], val[1usize], val[2usize], val[3usize], val[4usize], val[5usize], val[6usize], val[7usize], ], simd, } } } impl<S: Simd> From<mask64x8<S>> for [i64; 8] { #[inline(always)] fn from(value: mask64x8<S>) -> Self { value.val } } impl<S: Simd> core::ops::Deref for mask64x8<S> { type Target = [i64; 8]; #[inline(always)] fn deref(&self) -> &Self::Target { &self.val } } impl<S: Simd> core::ops::DerefMut for mask64x8<S> { #[inline(always)] fn deref_mut(&mut self) -> &mut Self::Target { &mut self.val } } impl<S: Simd> SimdFrom<i64, S> for mask64x8<S> { #[inline(always)] fn simd_from(value: i64, simd: S) -> Self { simd.splat_mask64x8(value) } } impl<S: Simd> Select<mask64x8<S>> for mask64x8<S> { #[inline(always)] fn select(self, if_true: mask64x8<S>, if_false: mask64x8<S>) -> mask64x8<S> { self.simd.select_mask64x8(self, if_true, if_false) } } impl<S: Simd> Bytes for mask64x8<S> { type Bytes = u8x64<S>; #[inline(always)] fn to_bytes(self) -> Self::Bytes { unsafe { u8x64 { val: core::mem::transmute(self.val), simd: self.simd, } } } #[inline(always)] fn from_bytes(value: Self::Bytes) -> Self { unsafe { Self { val: core::mem::transmute(value.val), simd: value.simd, } } } } impl<S: Simd> mask64x8<S> { #[inline(always)] pub fn not(self) -> mask64x8<S> { self.simd.not_mask64x8(self) } #[inline(always)] pub fn and(self, rhs: impl SimdInto<Self, S>) -> mask64x8<S> { self.simd.and_mask64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn or(self, rhs: impl SimdInto<Self, S>) -> mask64x8<S> { self.simd.or_mask64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn xor(self, rhs: impl SimdInto<Self, S>) -> mask64x8<S> { self.simd.xor_mask64x8(self, rhs.simd_into(self.simd)) } #[inline(always)] pub fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> mask64x8<S> { self.simd.simd_eq_mask64x8(self, rhs.simd_into(self.simd)) } } impl<S: Simd> crate::SimdBase<S> for mask64x8<S> { const N: usize = 8; type Mask = mask64x8<S>; type Block = mask64x2<S>; #[inline(always)] fn witness(&self) -> S { self.simd } #[inline(always)] fn as_slice(&self) -> &[i64] { &self.val } #[inline(always)] fn as_mut_slice(&mut self) -> &mut [i64] { &mut self.val } #[inline(always)] fn from_slice(simd: S, slice: &[i64]) -> Self { let mut val = [0; 8]; val.copy_from_slice(slice); Self { val, simd } } #[inline(always)] fn splat(simd: S, val: i64) -> Self { simd.splat_mask64x8(val) } #[inline(always)] fn block_splat(block: Self::Block) -> Self { let block2 = block.combine(block); block2.combine(block2) } } impl<S: Simd> crate::SimdMask<S> for mask64x8<S> { #[inline(always)] fn simd_eq(self, rhs: impl SimdInto<Self, S>) -> mask64x8<S> { self.simd.simd_eq_mask64x8(self, rhs.simd_into(self.simd)) } }
fearless_simd-0.3.0/src/generated/sse4_2.rs

// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

// This file is autogenerated by fearless_simd_gen

#![expect(
    unused_variables,
    clippy::todo,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]
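// Illustrative usage sketch (added for exposition; not produced by
// fearless_simd_gen). It shows the intended flow for the `Sse4_2` token
// defined below: detect the level at runtime, then call its intrinsic-backed
// methods. Assumes the crate's `std` feature (for `Level::new` runtime
// detection, per Cargo.toml) and an x86_64 host.
#[cfg(all(test, feature = "std", target_arch = "x86_64"))]
mod sse4_2_usage_sketch {
    use crate::{Level, Simd};

    #[test]
    fn splat_and_add() {
        // `Level::new` reports the best available SIMD level; we only
        // exercise the SSE4.2 token when that exact level is detected.
        if let Level::Sse4_2(simd) = Level::new() {
            let a = simd.splat_f32x4(1.5); // [1.5; 4]
            let b = simd.add_f32x4(a, a); // lane-wise add
            assert_eq!(b.val, [3.0; 4]);
        }
    }
}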
use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
use crate::{ f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4, i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32, mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8, u16x16, u16x32, u32x4, u32x8, u32x16, };
#[cfg(target_arch = "x86")] use core::arch::x86::*;
#[cfg(target_arch = "x86_64")] use core::arch::x86_64::*;
use core::ops::*;
#[doc = r#" The SIMD token for the "SSE 4.2" level."#] #[derive(Clone, Copy, Debug)] pub struct Sse4_2 { pub sse4_2: crate::core_arch::x86::Sse4_2, } impl Sse4_2 { #[doc = r" Create a SIMD token."] #[doc = r""] #[doc = r" # Safety"] #[doc = r""] #[doc = r" The SSE4.2 CPU feature must be available."] #[inline] pub const unsafe fn new_unchecked() -> Self { Sse4_2 { sse4_2: unsafe { crate::core_arch::x86::Sse4_2::new_unchecked() }, } } } impl Seal for Sse4_2 {} impl Simd for Sse4_2 { type f32s = f32x4<Self>; type u8s = u8x16<Self>; type i8s = i8x16<Self>; type u16s = u16x8<Self>; type i16s = i16x8<Self>; type u32s = u32x4<Self>; type i32s = i32x4<Self>; type mask8s = mask8x16<Self>; type mask16s = mask16x8<Self>; type mask32s = mask32x4<Self>; #[inline(always)] fn level(self) -> Level { Level::Sse4_2(self) } #[inline] fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R { #[target_feature(enable = "sse4.2")] #[inline] unsafe fn vectorize_sse4_2<F: FnOnce() -> R, R>(f: F) -> R { f() } unsafe { vectorize_sse4_2(f) } } #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4<Self> { unsafe { _mm_set1_ps(val).simd_into(self) } } #[inline(always)] fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> { unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } } #[inline(always)] fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> { unsafe { _mm_xor_ps(a.into(), _mm_set1_ps(-0.0)).simd_into(self) } } #[inline(always)] fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> { unsafe { _mm_sqrt_ps(a.into()).simd_into(self) } } #[inline(always)] fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { unsafe { _mm_add_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { unsafe { _mm_sub_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { unsafe { _mm_mul_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { unsafe { _mm_div_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { unsafe { let mask = _mm_set1_ps(-0.0); _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, a.into())).simd_into(self) } } #[inline(always)] fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { unsafe { _mm_castps_si128(_mm_cmpeq_ps(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { unsafe { _mm_castps_si128(_mm_cmplt_ps(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { unsafe { _mm_castps_si128(_mm_cmple_ps(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { unsafe { _mm_castps_si128(_mm_cmpge_ps(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { unsafe { _mm_castps_si128(_mm_cmpgt_ps(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { unsafe { _mm_unpacklo_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) ->
f32x4 { unsafe { _mm_unpackhi_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_shuffle_ps::<0b10_00_10_00>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_high_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_shuffle_ps::<0b11_01_11_01>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_max_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { unsafe { _mm_min_ps(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn madd_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { a * b + c } #[inline(always)] fn msub_f32x4(self, a: f32x4, b: f32x4, c: f32x4) -> f32x4 { a * b - c } #[inline(always)] fn floor_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_floor_ps(a.into()).simd_into(self) } } #[inline(always)] fn fract_f32x4(self, a: f32x4) -> f32x4 { a - a.trunc() } #[inline(always)] fn trunc_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_round_ps(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) } } #[inline(always)] fn select_f32x4(self, a: mask32x4, b: f32x4, c: f32x4) -> f32x4 { unsafe { let mask = _mm_castsi128_ps(a.into()); _mm_or_ps(_mm_and_ps(mask, b.into()), _mm_andnot_ps(mask, c.into())).simd_into(self) } } #[inline(always)] fn combine_f32x4(self, a: f32x4, b: f32x4) -> f32x8 { let mut result = [0.0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4) -> f64x2 { f64x2 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4) -> i32x4 { i32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u8_f32x4(self, a: f32x4) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4) -> u32x4 { unsafe { _mm_cvtps_epi32(_mm_max_ps(_mm_floor_ps(a.into()), _mm_set1_ps(0.0))).simd_into(self) } } #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4) -> i32x4 { unsafe { _mm_cvtps_epi32(a.trunc().into()).simd_into(self) } } #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16 { unsafe { _mm_set1_epi8(val).simd_into(self) } } #[inline(always)] fn not_i8x16(self, a: i8x16) -> i8x16 { a ^ !0 } #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { todo!() } #[inline(always)] fn and_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) 
} } #[inline(always)] fn shr_i8x16(self, a: i8x16, shift: u32) -> i8x16 { unsafe { let val = a.into(); let shift_count = _mm_cvtsi32_si128(shift as i32); let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128())); let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128())); let lo_shifted = _mm_sra_epi16(lo_16, shift_count); let hi_shifted = _mm_sra_epi16(hi_16, shift_count); _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) } } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { unsafe { let val = a.into(); let shift_count = _mm_cvtsi32_si128(shift as i32); let lo_16 = _mm_unpacklo_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128())); let hi_16 = _mm_unpackhi_epi8(val, _mm_cmplt_epi8(val, _mm_setzero_si128())); let lo_shifted = _mm_sll_epi16(lo_16, shift_count); let hi_shifted = _mm_sll_epi16(hi_16, shift_count); _mm_packs_epi16(lo_shifted, hi_shifted).simd_into(self) } } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmplt_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(_mm_min_epi8(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(_mm_max_epi8(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { unsafe { _mm_cmpgt_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_min_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_max_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { unsafe { _mm_sub_epi8(_mm_setzero_si128(), a.into()).simd_into(self) } } #[inline(always)] fn 
reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { unsafe { _mm_set1_epi8(val as _).simd_into(self) } } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { a ^ !0 } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_sub_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { todo!() } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { unsafe { let val = a.into(); let shift_count = _mm_cvtsi32_si128(shift as i32); let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); let lo_shifted = _mm_srl_epi16(lo_16, shift_count); let hi_shifted = _mm_srl_epi16(hi_16, shift_count); _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) } } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { unsafe { let val = a.into(); let shift_count = _mm_cvtsi32_si128(shift as i32); let lo_16 = _mm_unpacklo_epi8(val, _mm_setzero_si128()); let hi_16 = _mm_unpackhi_epi8(val, _mm_setzero_si128()); let lo_shifted = _mm_sll_epi16(lo_16, shift_count); let hi_shifted = _mm_sll_epi16(hi_16, shift_count); _mm_packus_epi16(lo_shifted, hi_shifted).simd_into(self) } } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { let sign_bit = _mm_set1_epi8(0x80u8 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self) } } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { let sign_bit = _mm_set1_epi8(0x80u8 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi8(b_signed, a_signed).simd_into(self) } } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(_mm_min_epu8(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(_mm_max_epu8(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { unsafe { let sign_bit = _mm_set1_epi8(0x80u8 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi8(a_signed, b_signed).simd_into(self) } } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_unpacklo_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { 
_mm_unpackhi_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { let mask = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { let mask = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_min_epu8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_max_epu8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { unsafe { let raw = a.into(); let high = _mm_cvtepu8_epi16(raw).simd_into(self); let low = _mm_cvtepu8_epi16(_mm_srli_si128::<8>(raw)).simd_into(self); self.combine_u16x8(high, low) } } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_mask8x16(self, val: i8) -> mask8x16 { unsafe { _mm_set1_epi8(val).simd_into(self) } } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { a ^ !0 } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask8x16( self, a: mask8x16, b: mask8x16, c: mask8x16, ) -> mask8x16 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_cmpeq_epi8(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { unsafe { _mm_set1_epi16(val).simd_into(self) } } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { a ^ !0 } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> 
i16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { unsafe { _mm_sra_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmplt_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(_mm_min_epi16(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(_mm_max_epi16(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { unsafe { _mm_cmpgt_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_min_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_max_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { unsafe { _mm_sub_epi16(_mm_setzero_si128(), a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { unsafe { 
_mm_set1_epi16(val as _).simd_into(self) } } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { a ^ !0 } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_sub_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_mullo_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { unsafe { _mm_srl_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { unsafe { _mm_sll_epi16(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { /* lane equality is sign-agnostic, so the plain integer cmpeq works for unsigned lanes */ _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { let sign_bit = _mm_set1_epi16(0x8000u16 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi16(b_signed, a_signed).simd_into(self) } } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(_mm_min_epu16(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(_mm_max_epu16(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { unsafe { let sign_bit = _mm_set1_epi16(0x8000u16 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi16(a_signed, b_signed).simd_into(self) } } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_unpacklo_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_unpackhi_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { let mask = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15); let t1 = _mm_shuffle_epi8(a.into(), mask); let t2 = _mm_shuffle_epi8(b.into(), mask); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { unsafe { _mm_or_si128( _mm_and_si128(a.into(),
b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_min_epu16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_max_epu16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_mask16x8(self, val: i16) -> mask16x8 { unsafe { _mm_set1_epi16(val).simd_into(self) } } #[inline(always)] fn not_mask16x8(self, a: mask16x8) -> mask16x8 { a ^ !0 } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask16x8( self, a: mask16x8, b: mask16x8, c: mask16x8, ) -> mask16x8 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_cmpeq_epi16(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { unsafe { _mm_set1_epi32(val).simd_into(self) } } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { a ^ !0 } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { unsafe { _mm_sra_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(a.into(), 
b.into()).simd_into(self) } } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmplt_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(_mm_min_epi32(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(_mm_max_epi32(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { unsafe { _mm_cmpgt_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); _mm_unpackhi_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_min_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_max_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { unsafe { _mm_sub_epi32(_mm_setzero_si128(), a.into()).simd_into(self) } } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { u32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { unsafe { _mm_cvtepi32_ps(a.into()).simd_into(self) } } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { unsafe { _mm_set1_epi32(val as _).simd_into(self) } } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { a ^ !0 } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_sub_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_mullo_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } 
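// Note on the unzip_{low,high}_i32x4 implementations above (their u32x4 counterparts
// below use the same trick): `_mm_shuffle_epi32::<0b11_01_10_00>` reorders each input
// to lane order [0, 2, 1, 3], gathering the even-indexed lanes into the low 64 bits and
// the odd-indexed lanes into the high 64 bits, so `_mm_unpacklo_epi64` /
// `_mm_unpackhi_epi64` can then splice the matching halves of the two inputs together.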
#[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { unsafe { _mm_srl_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { unsafe { _mm_sll_epi32(a.into(), _mm_cvtsi32_si128(shift as _)).simd_into(self) } } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { /* lane equality is sign-agnostic, so the plain integer cmpeq works for unsigned lanes */ _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { let sign_bit = _mm_set1_epi32(0x80000000u32 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi32(b_signed, a_signed).simd_into(self) } } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(_mm_min_epu32(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(_mm_max_epu32(a.into(), b.into()), a.into()).simd_into(self) } } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { unsafe { let sign_bit = _mm_set1_epi32(0x80000000u32 as _); let a_signed = _mm_xor_si128(a.into(), sign_bit); let b_signed = _mm_xor_si128(b.into(), sign_bit); _mm_cmpgt_epi32(a_signed, b_signed).simd_into(self) } } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_unpacklo_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_unpackhi_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); _mm_unpacklo_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { let t1 = _mm_shuffle_epi32::<0b11_01_10_00>(a.into()); let t2 = _mm_shuffle_epi32::<0b11_01_10_00>(b.into()); _mm_unpackhi_epi64(t1, t2).simd_into(self) } } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_min_epu32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_max_epu32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { u8x16 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { unsafe { /* note: converts through the signed i32 path, so lanes >= 1 << 31 are misinterpreted as negative */ _mm_cvtepi32_ps(a.into()).simd_into(self) } } #[inline(always)] fn splat_mask32x4(self, val: i32) -> mask32x4 { unsafe { _mm_set1_epi32(val).simd_into(self) } } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { a ^ !0 }
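// Note on the unsigned comparisons above: SSE has no unsigned integer compares, so
// simd_lt/simd_gt flip the sign bit of both operands (xor with 0x8000_0000), which maps
// unsigned order onto signed order before using `_mm_cmpgt_epi32`; simd_le/simd_ge
// instead rely on the identity `min(a, b) == a` iff `a <= b`, via the unsigned
// min/max intrinsics.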
#[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask32x4( self, a: mask32x4, b: mask32x4, c: mask32x4, ) -> mask32x4 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_cmpeq_epi32(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { unsafe { _mm_set1_pd(val).simd_into(self) } } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_xor_pd(a.into(), _mm_set1_pd(-0.0)).simd_into(self) } } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_sqrt_pd(a.into()).simd_into(self) } } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_add_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_sub_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_mul_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_div_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { let mask = _mm_set1_pd(-0.0); _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, a.into())).simd_into(self) } } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmpeq_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmplt_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmple_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmpge_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { unsafe { _mm_castpd_si128(_mm_cmpgt_pd(a.into(), b.into())).simd_into(self) } } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_unpacklo_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn zip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_unpackhi_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_shuffle_pd::<0b00>(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_shuffle_pd::<0b11>(a.into(), b.into()).simd_into(self) } } 
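// Note on the f64x2 block above: abs/neg/copysign manipulate only the IEEE sign bit by
// masking with -0.0 (a constant with just the sign bit set), and select is the classic
// (mask & b) | (!mask & c) and/andnot/or sequence, which is valid because mask lanes
// are all-ones or all-zeros and avoids requiring the SSE4.1 blendv instructions.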
#[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_max_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { unsafe { _mm_min_pd(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn madd_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { a * b + c } #[inline(always)] fn msub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { a * b - c } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_floor_pd(a.into()).simd_into(self) } } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { a - a.trunc() } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_round_pd(a.into(), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC).simd_into(self) } } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { unsafe { let mask = _mm_castsi128_pd(a.into()); _mm_or_pd(_mm_and_pd(mask, b.into()), _mm_andnot_pd(mask, c.into())).simd_into(self) } } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { let mut result = [0.0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { f32x4 { val: bytemuck::cast(a.val), simd: a.simd, } } #[inline(always)] fn splat_mask64x2(self, val: i64) -> mask64x2 { unsafe { _mm_set1_epi64x(val).simd_into(self) } } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { a ^ !0 } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_or_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_xor_si128(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn select_mask64x2( self, a: mask64x2, b: mask64x2, c: mask64x2, ) -> mask64x2 { unsafe { _mm_or_si128( _mm_and_si128(a.into(), b.into()), _mm_andnot_si128(a.into(), c.into()), ) .simd_into(self) } } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_cmpeq_epi64(a.into(), b.into()).simd_into(self) } } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { let mut result = [0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f32x8(self, a: f32) -> f32x8 { let half = self.splat_f32x4(a); self.combine_f32x4(half, half) } #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) } #[inline(always)] fn neg_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) } #[inline(always)] fn sqrt_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = 
self.split_f32x8(b); self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) } #[inline(always)] fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) } #[inline(always)] fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) } #[inline(always)] fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) } #[inline(always)] fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) } #[inline(always)] fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) } #[inline(always)] fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) } #[inline(always)] fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) } #[inline(always)] fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) } #[inline(always)] fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) } #[inline(always)] fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, _) = self.split_f32x8(a); let (b0, _) = self.split_f32x8(b); self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) } #[inline(always)] fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (_, a1) = self.split_f32x8(a); let (_, b1) = self.split_f32x8(b); self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) } #[inline(always)] fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) } #[inline(always)] fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) } #[inline(always)] fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) } #[inline(always)] fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.max_precise_f32x4(a0, b0), self.max_precise_f32x4(a1, b1), ) } #[inline(always)] fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = 
self.split_f32x8(b); self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.min_precise_f32x4(a0, b0), self.min_precise_f32x4(a1, b1), ) } #[inline(always)] fn madd_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1)) } #[inline(always)] fn msub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1)) } #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) } #[inline(always)] fn fract_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) } #[inline(always)] fn trunc_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) } #[inline(always)] fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) } #[inline(always)] fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { let mut result = [0.0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( self.reinterpret_f64_f32x4(a0), self.reinterpret_f64_f32x4(a1), ) } #[inline(always)] fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4( self.reinterpret_i32_f32x4(a0), self.reinterpret_i32_f32x4(a1), ) } #[inline(always)] fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { let (a0, a1) = self.split_f32x8(a); self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) } #[inline(always)] fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4( self.reinterpret_u32_f32x4(a0), self.reinterpret_u32_f32x4(a1), ) } #[inline(always)] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) } #[inline(always)] fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) } #[inline(always)] fn splat_i8x32(self, a: i8) -> i8x32 { let half = self.splat_i8x16(a); self.combine_i8x16(half, half) } #[inline(always)] fn not_i8x32(self, a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) } #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) 
= self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) } #[inline(always)] fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) } #[inline(always)] fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) } #[inline(always)] fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) } #[inline(always)] fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) } #[inline(always)] fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) } #[inline(always)] fn shr_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b)) } #[inline(always)] fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) } #[inline(always)] fn shl_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shl_i8x16(a0, b), self.shl_i8x16(a1, b)) } #[inline(always)] fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) } #[inline(always)] fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) } #[inline(always)] fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) } #[inline(always)] fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) } #[inline(always)] fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) } #[inline(always)] fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, _) = self.split_i8x32(a); let (b0, _) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) } #[inline(always)] fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (_, a1) = self.split_i8x32(a); let (_, b1) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) } #[inline(always)] fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_low_i8x16(a0, a1), self.unzip_low_i8x16(b0, b1)) } #[inline(always)] fn 
unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) } #[inline(always)] fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_i8x32(b); let (c0, c1) = self.split_i8x32(c); self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) } #[inline(always)] fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) } #[inline(always)] fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) } #[inline(always)] fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x32(self, a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) } #[inline(always)] fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) } #[inline(always)] fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { let (a0, a1) = self.split_i8x32(a); self.combine_u32x4( self.reinterpret_u32_i8x16(a0), self.reinterpret_u32_i8x16(a1), ) } #[inline(always)] fn splat_u8x32(self, a: u8) -> u8x32 { let half = self.splat_u8x16(a); self.combine_u8x16(half, half) } #[inline(always)] fn not_u8x32(self, a: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) } #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) } #[inline(always)] fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) } #[inline(always)] fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) } #[inline(always)] fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) } #[inline(always)] fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) } #[inline(always)] fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) } #[inline(always)] fn shr_u8x32(self, a: u8x32, b: u32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); 
self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b)) } #[inline(always)] fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) } #[inline(always)] fn shl_u8x32(self, a: u8x32, b: u32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.shl_u8x16(a0, b), self.shl_u8x16(a1, b)) } #[inline(always)] fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) } #[inline(always)] fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) } #[inline(always)] fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) } #[inline(always)] fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) } #[inline(always)] fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) } #[inline(always)] fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, _) = self.split_u8x32(a); let (b0, _) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) } #[inline(always)] fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (_, a1) = self.split_u8x32(a); let (_, b1) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) } #[inline(always)] fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) } #[inline(always)] fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) } #[inline(always)] fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_u8x32(b); let (c0, c1) = self.split_u8x32(c); self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) } #[inline(always)] fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) } #[inline(always)] fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) } #[inline(always)] fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; 
b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn widen_u8x32(self, a: u8x32) -> u16x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) } #[inline(always)] fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { let (a0, a1) = self.split_u8x32(a); self.combine_u32x4( self.reinterpret_u32_u8x16(a0), self.reinterpret_u32_u8x16(a1), ) } #[inline(always)] fn splat_mask8x32(self, a: i8) -> mask8x32 { let half = self.splat_mask8x16(a); self.combine_mask8x16(half, half) } #[inline(always)] fn not_mask8x32(self, a: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) } #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) } #[inline(always)] fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) } #[inline(always)] fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) } #[inline(always)] fn select_mask8x32( self, a: mask8x32, b: mask8x32, c: mask8x32, ) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); let (c0, c1) = self.split_mask8x32(c); self.combine_mask8x16( self.select_mask8x16(a0, b0, c0), self.select_mask8x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) } #[inline(always)] fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x16(self, a: i16) -> i16x16 { let half = self.splat_i16x8(a); self.combine_i16x8(half, half) } #[inline(always)] fn not_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) } #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) } #[inline(always)] fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) } #[inline(always)] fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) } #[inline(always)] fn and_i16x16(self, a: i16x16, b: i16x16) -> 
i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) } #[inline(always)] fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) } #[inline(always)] fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) } #[inline(always)] fn shr_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b)) } #[inline(always)] fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) } #[inline(always)] fn shl_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shl_i16x8(a0, b), self.shl_i16x8(a1, b)) } #[inline(always)] fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) } #[inline(always)] fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) } #[inline(always)] fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) } #[inline(always)] fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) } #[inline(always)] fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) } #[inline(always)] fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, _) = self.split_i16x16(a); let (b0, _) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) } #[inline(always)] fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (_, a1) = self.split_i16x16(a); let (_, b1) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) } #[inline(always)] fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) } #[inline(always)] fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) } #[inline(always)] fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_i16x16(b); let (c0, c1) = self.split_i16x16(c); self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) } #[inline(always)] fn min_i16x16(self, 
a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) } #[inline(always)] fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) } #[inline(always)] fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) } #[inline(always)] fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { let (a0, a1) = self.split_i16x16(a); self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) } #[inline(always)] fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { let (a0, a1) = self.split_i16x16(a); self.combine_u32x4( self.reinterpret_u32_i16x8(a0), self.reinterpret_u32_i16x8(a1), ) } #[inline(always)] fn splat_u16x16(self, a: u16) -> u16x16 { let half = self.splat_u16x8(a); self.combine_u16x8(half, half) } #[inline(always)] fn not_u16x16(self, a: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) } #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) } #[inline(always)] fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) } #[inline(always)] fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) } #[inline(always)] fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) } #[inline(always)] fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) } #[inline(always)] fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) } #[inline(always)] fn shr_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b)) } #[inline(always)] fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) } #[inline(always)] fn shl_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shl_u16x8(a0, b), self.shl_u16x8(a1, b)) } 
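// Note on the 16-lane (256-bit) operations in this region: this backend targets 128-bit
// SSE registers, so each wide op is emulated by split_*-ting the value into two native
// halves, applying the 8-lane implementation to each, and combine_*-ing the results;
// with #[inline(always)] the copies through the intermediate arrays are expected to
// optimize away, though that is a codegen assumption rather than a guarantee.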
#[inline(always)] fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) } #[inline(always)] fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) } #[inline(always)] fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) } #[inline(always)] fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) } #[inline(always)] fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) } #[inline(always)] fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, _) = self.split_u16x16(a); let (b0, _) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) } #[inline(always)] fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (_, a1) = self.split_u16x16(a); let (_, b1) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) } #[inline(always)] fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) } #[inline(always)] fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) } #[inline(always)] fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_u16x16(b); let (c0, c1) = self.split_u16x16(c); self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) } #[inline(always)] fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) } #[inline(always)] fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) } #[inline(always)] fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { let (a, b) = self.split_u16x16(a); unsafe { let mask = _mm_set1_epi16(0xFF); let lo_masked = _mm_and_si128(a.into(), mask); let hi_masked = _mm_and_si128(b.into(), mask); let result = _mm_packus_epi16(lo_masked, 
hi_masked); result.simd_into(self) } } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { let (a0, a1) = self.split_u16x16(a); self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) } #[inline(always)] fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { let (a0, a1) = self.split_u16x16(a); self.combine_u32x4( self.reinterpret_u32_u16x8(a0), self.reinterpret_u32_u16x8(a1), ) } #[inline(always)] fn splat_mask16x16(self, a: i16) -> mask16x16 { let half = self.splat_mask16x8(a); self.combine_mask16x8(half, half) } #[inline(always)] fn not_mask16x16(self, a: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) } #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) } #[inline(always)] fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) } #[inline(always)] fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) } #[inline(always)] fn select_mask16x16( self, a: mask16x16, b: mask16x16, c: mask16x16, ) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); let (c0, c1) = self.split_mask16x16(c); self.combine_mask16x8( self.select_mask16x8(a0, b0, c0), self.select_mask16x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) } #[inline(always)] fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x8(self, a: i32) -> i32x8 { let half = self.splat_i32x4(a); self.combine_i32x4(half, half) } #[inline(always)] fn not_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) } #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) } #[inline(always)] fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) } #[inline(always)] fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) } #[inline(always)] fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let 
(b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) } #[inline(always)] fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) } #[inline(always)] fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) } #[inline(always)] fn shr_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b)) } #[inline(always)] fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) } #[inline(always)] fn shl_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shl_i32x4(a0, b), self.shl_i32x4(a1, b)) } #[inline(always)] fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) } #[inline(always)] fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) } #[inline(always)] fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) } #[inline(always)] fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) } #[inline(always)] fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) } #[inline(always)] fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, _) = self.split_i32x8(a); let (b0, _) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) } #[inline(always)] fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (_, a1) = self.split_i32x8(a); let (_, b1) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) } #[inline(always)] fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) } #[inline(always)] fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) } #[inline(always)] fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_i32x8(b); let (c0, c1) = self.split_i32x8(c); self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) } #[inline(always)] fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.min_i32x4(a0, 
b0), self.min_i32x4(a1, b1)) } #[inline(always)] fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) } #[inline(always)] fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) } #[inline(always)] fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { let (a0, a1) = self.split_i32x8(a); self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) } #[inline(always)] fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_u32x4( self.reinterpret_u32_i32x4(a0), self.reinterpret_u32_i32x4(a1), ) } #[inline(always)] fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) } #[inline(always)] fn splat_u32x8(self, a: u32) -> u32x8 { let half = self.splat_u32x4(a); self.combine_u32x4(half, half) } #[inline(always)] fn not_u32x8(self, a: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) } #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) } #[inline(always)] fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) } #[inline(always)] fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) } #[inline(always)] fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) } #[inline(always)] fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) } #[inline(always)] fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) } #[inline(always)] fn shr_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b)) } #[inline(always)] fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) } #[inline(always)] fn shl_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shl_u32x4(a0, b), self.shl_u32x4(a1, b)) } #[inline(always)] fn simd_eq_u32x8(self, a: u32x8, b: u32x8) -> 
mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) } #[inline(always)] fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) } #[inline(always)] fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) } #[inline(always)] fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) } #[inline(always)] fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) } #[inline(always)] fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, _) = self.split_u32x8(a); let (b0, _) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) } #[inline(always)] fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (_, a1) = self.split_u32x8(a); let (_, b1) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) } #[inline(always)] fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) } #[inline(always)] fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) } #[inline(always)] fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_u32x8(b); let (c0, c1) = self.split_u32x8(c); self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) } #[inline(always)] fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) } #[inline(always)] fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) } #[inline(always)] fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { let (a0, a1) = self.split_u32x8(a); self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) } #[inline(always)] fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) } #[inline(always)] fn splat_mask32x8(self, a: i32) -> mask32x8 { 
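// Splat one half-width mask, then duplicate it to fill both halves — the same
// doubling idiom the other `splat_*` methods in this generated impl use.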
let half = self.splat_mask32x4(a); self.combine_mask32x4(half, half) } #[inline(always)] fn not_mask32x8(self, a: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) } #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) } #[inline(always)] fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) } #[inline(always)] fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) } #[inline(always)] fn select_mask32x8( self, a: mask32x8, b: mask32x8, c: mask32x8, ) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); let (c0, c1) = self.split_mask32x8(c); self.combine_mask32x4( self.select_mask32x4(a0, b0, c0), self.select_mask32x4(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) } #[inline(always)] fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f64x4(self, a: f64) -> f64x4 { let half = self.splat_f64x2(a); self.combine_f64x2(half, half) } #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) } #[inline(always)] fn neg_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) } #[inline(always)] fn sqrt_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) } #[inline(always)] fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) } #[inline(always)] fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) } #[inline(always)] fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) } #[inline(always)] fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = 
self.split_f64x4(b); self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) } #[inline(always)] fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) } #[inline(always)] fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) } #[inline(always)] fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) } #[inline(always)] fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) } #[inline(always)] fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) } #[inline(always)] fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, _) = self.split_f64x4(a); let (b0, _) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) } #[inline(always)] fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (_, a1) = self.split_f64x4(a); let (_, b1) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) } #[inline(always)] fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) } #[inline(always)] fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) } #[inline(always)] fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) } #[inline(always)] fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.max_precise_f64x2(a0, b0), self.max_precise_f64x2(a1, b1), ) } #[inline(always)] fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) } #[inline(always)] fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.min_precise_f64x2(a0, b0), self.min_precise_f64x2(a1, b1), ) } #[inline(always)] fn madd_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1)) } #[inline(always)] fn msub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1)) } 
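// The recursive lowering used by `madd_f64x4`/`msub_f64x4` above is the
// pattern for most wide ops in this impl: split each 4-lane argument into two
// 2-lane halves, apply the half-width op, and recombine the results, e.g.
// `combine_f64x2(madd_f64x2(a0, b0, c0), madd_f64x2(a1, b1, c1))`.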
#[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) } #[inline(always)] fn fract_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) } #[inline(always)] fn trunc_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) } #[inline(always)] fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) } #[inline(always)] fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { let mut result = [0.0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { let mut b0 = [0.0; 2usize]; let mut b1 = [0.0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( self.reinterpret_f32_f64x2(a0), self.reinterpret_f32_f64x2(a1), ) } #[inline(always)] fn splat_mask64x4(self, a: i64) -> mask64x4 { let half = self.splat_mask64x2(a); self.combine_mask64x2(half, half) } #[inline(always)] fn not_mask64x4(self, a: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) } #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) } #[inline(always)] fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) } #[inline(always)] fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) } #[inline(always)] fn select_mask64x4( self, a: mask64x4, b: mask64x4, c: mask64x4, ) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); let (c0, c1) = self.split_mask64x4(c); self.combine_mask64x2( self.select_mask64x2(a0, b0, c0), self.select_mask64x2(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) } #[inline(always)] fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { let mut b0 = [0; 2usize]; let mut b1 = [0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f32x16(self, a: f32) -> f32x16 { let 
half = self.splat_f32x8(a); self.combine_f32x8(half, half) } #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) } #[inline(always)] fn neg_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) } #[inline(always)] fn sqrt_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) } #[inline(always)] fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) } #[inline(always)] fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) } #[inline(always)] fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) } #[inline(always)] fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) } #[inline(always)] fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) } #[inline(always)] fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) } #[inline(always)] fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) } #[inline(always)] fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) } #[inline(always)] fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) } #[inline(always)] fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, _) = self.split_f32x16(a); let (b0, _) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) } #[inline(always)] fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (_, a1) = self.split_f32x16(a); let (_, b1) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) } #[inline(always)] fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_low_f32x8(a0, a1), self.unzip_low_f32x8(b0, b1)) } #[inline(always)] fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> 
f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) } #[inline(always)] fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) } #[inline(always)] fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.max_precise_f32x8(a0, b0), self.max_precise_f32x8(a1, b1), ) } #[inline(always)] fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) } #[inline(always)] fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.min_precise_f32x8(a0, b0), self.min_precise_f32x8(a1, b1), ) } #[inline(always)] fn madd_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.madd_f32x8(a0, b0, c0), self.madd_f32x8(a1, b1, c1)) } #[inline(always)] fn msub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1)) } #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) } #[inline(always)] fn fract_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) } #[inline(always)] fn trunc_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) } #[inline(always)] fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) } #[inline(always)] fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { let mut b0 = [0.0; 8usize]; let mut b1 = [0.0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( self.reinterpret_f64_f32x8(a0), self.reinterpret_f64_f32x8(a1), ) } #[inline(always)] fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8( self.reinterpret_i32_f32x8(a0), self.reinterpret_i32_f32x8(a1), ) } #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { crate::Fallback::new() .load_interleaved_128_f32x16(src) .val .simd_into(self) } #[inline(always)] fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { let fb = crate::Fallback::new(); fb.store_interleaved_128_f32x16(a.val.simd_into(fb), dest); } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); 
self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) } #[inline(always)] fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8( self.reinterpret_u32_f32x8(a0), self.reinterpret_u32_f32x8(a1), ) } #[inline(always)] fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) } #[inline(always)] fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) } #[inline(always)] fn splat_i8x64(self, a: i8) -> i8x64 { let half = self.splat_i8x32(a); self.combine_i8x32(half, half) } #[inline(always)] fn not_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) } #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) } #[inline(always)] fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) } #[inline(always)] fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) } #[inline(always)] fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) } #[inline(always)] fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) } #[inline(always)] fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) } #[inline(always)] fn shr_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b)) } #[inline(always)] fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) } #[inline(always)] fn shl_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.shl_i8x32(a0, b), self.shl_i8x32(a1, b)) } #[inline(always)] fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) } #[inline(always)] fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) } #[inline(always)] fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) } #[inline(always)] fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); 
self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) } #[inline(always)] fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) } #[inline(always)] fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, _) = self.split_i8x64(a); let (b0, _) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) } #[inline(always)] fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (_, a1) = self.split_i8x64(a); let (_, b1) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) } #[inline(always)] fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) } #[inline(always)] fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) } #[inline(always)] fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_i8x64(b); let (c0, c1) = self.split_i8x64(c); self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) } #[inline(always)] fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.min_i8x32(a0, b0), self.min_i8x32(a1, b1)) } #[inline(always)] fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) } #[inline(always)] fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) } #[inline(always)] fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) } #[inline(always)] fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { let (a0, a1) = self.split_i8x64(a); self.combine_u32x8( self.reinterpret_u32_i8x32(a0), self.reinterpret_u32_i8x32(a1), ) } #[inline(always)] fn splat_u8x64(self, a: u8) -> u8x64 { let half = self.splat_u8x32(a); self.combine_u8x32(half, half) } #[inline(always)] fn not_u8x64(self, a: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) } #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) } #[inline(always)] fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) } #[inline(always)] fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); 
self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) } #[inline(always)] fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) } #[inline(always)] fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) } #[inline(always)] fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) } #[inline(always)] fn shr_u8x64(self, a: u8x64, b: u32) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b)) } #[inline(always)] fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) } #[inline(always)] fn shl_u8x64(self, a: u8x64, b: u32) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shl_u8x32(a0, b), self.shl_u8x32(a1, b)) } #[inline(always)] fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) } #[inline(always)] fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) } #[inline(always)] fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) } #[inline(always)] fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) } #[inline(always)] fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) } #[inline(always)] fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, _) = self.split_u8x64(a); let (b0, _) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) } #[inline(always)] fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (_, a1) = self.split_u8x64(a); let (_, b1) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) } #[inline(always)] fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) } #[inline(always)] fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) } #[inline(always)] fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_u8x64(b); let (c0, c1) = self.split_u8x64(c); self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) } 
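// `select_u8x64` above splits the mask alongside both value operands, so each
// half-width `select_u8x32` call sees exactly the mask lanes that govern its
// own 32 lanes.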
#[inline(always)] fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) } #[inline(always)] fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) } #[inline(always)] fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { crate::Fallback::new() .load_interleaved_128_u8x64(src) .val .simd_into(self) } #[inline(always)] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { let fb = crate::Fallback::new(); fb.store_interleaved_128_u8x64(a.val.simd_into(fb), dest); } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { let (a0, a1) = self.split_u8x64(a); self.combine_u32x8( self.reinterpret_u32_u8x32(a0), self.reinterpret_u32_u8x32(a1), ) } #[inline(always)] fn splat_mask8x64(self, a: i8) -> mask8x64 { let half = self.splat_mask8x32(a); self.combine_mask8x32(half, half) } #[inline(always)] fn not_mask8x64(self, a: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) } #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) } #[inline(always)] fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) } #[inline(always)] fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) } #[inline(always)] fn select_mask8x64( self, a: mask8x64, b: mask8x64, c: mask8x64, ) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); let (c0, c1) = self.split_mask8x64(c); self.combine_mask8x32( self.select_mask8x32(a0, b0, c0), self.select_mask8x32(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) } #[inline(always)] fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x32(self, a: i16) -> i16x32 { let half = self.splat_i16x16(a); self.combine_i16x16(half, half) } #[inline(always)] fn not_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) } #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.add_i16x16(a0, b0), 
self.add_i16x16(a1, b1)) } #[inline(always)] fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) } #[inline(always)] fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) } #[inline(always)] fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) } #[inline(always)] fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) } #[inline(always)] fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) } #[inline(always)] fn shr_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b)) } #[inline(always)] fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) } #[inline(always)] fn shl_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shl_i16x16(a0, b), self.shl_i16x16(a1, b)) } #[inline(always)] fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) } #[inline(always)] fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) } #[inline(always)] fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) } #[inline(always)] fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) } #[inline(always)] fn simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) } #[inline(always)] fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, _) = self.split_i16x32(a); let (b0, _) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) } #[inline(always)] fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (_, a1) = self.split_i16x32(a); let (_, b1) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) } #[inline(always)] fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, 
b1)) } #[inline(always)] fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16( self.unzip_high_i16x16(a0, a1), self.unzip_high_i16x16(b0, b1), ) } #[inline(always)] fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_i16x32(b); let (c0, c1) = self.split_i16x32(c); self.combine_i16x16( self.select_i16x16(a0, b0, c0), self.select_i16x16(a1, b1, c1), ) } #[inline(always)] fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) } #[inline(always)] fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) } #[inline(always)] fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) } #[inline(always)] fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { let (a0, a1) = self.split_i16x32(a); self.combine_u8x32( self.reinterpret_u8_i16x16(a0), self.reinterpret_u8_i16x16(a1), ) } #[inline(always)] fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { let (a0, a1) = self.split_i16x32(a); self.combine_u32x8( self.reinterpret_u32_i16x16(a0), self.reinterpret_u32_i16x16(a1), ) } #[inline(always)] fn splat_u16x32(self, a: u16) -> u16x32 { let half = self.splat_u16x16(a); self.combine_u16x16(half, half) } #[inline(always)] fn not_u16x32(self, a: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) } #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) } #[inline(always)] fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) } #[inline(always)] fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) } #[inline(always)] fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) } #[inline(always)] fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) } #[inline(always)] fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) } #[inline(always)] fn shr_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, 
b)) } #[inline(always)] fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) } #[inline(always)] fn shl_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shl_u16x16(a0, b), self.shl_u16x16(a1, b)) } #[inline(always)] fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) } #[inline(always)] fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) } #[inline(always)] fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) } #[inline(always)] fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) } #[inline(always)] fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) } #[inline(always)] fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, _) = self.split_u16x32(a); let (b0, _) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) } #[inline(always)] fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (_, a1) = self.split_u16x32(a); let (_, b1) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) } #[inline(always)] fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) } #[inline(always)] fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16( self.unzip_high_u16x16(a0, a1), self.unzip_high_u16x16(b0, b1), ) } #[inline(always)] fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_u16x32(b); let (c0, c1) = self.split_u16x32(c); self.combine_u16x16( self.select_u16x16(a0, b0, c0), self.select_u16x16(a1, b1, c1), ) } #[inline(always)] fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) } #[inline(always)] fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) } #[inline(always)] fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn 
load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { crate::Fallback::new() .load_interleaved_128_u16x32(src) .val .simd_into(self) } #[inline(always)] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { let fb = crate::Fallback::new(); fb.store_interleaved_128_u16x32(a.val.simd_into(fb), dest); } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) } #[inline(always)] fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { let (a0, a1) = self.split_u16x32(a); self.combine_u8x32( self.reinterpret_u8_u16x16(a0), self.reinterpret_u8_u16x16(a1), ) } #[inline(always)] fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { let (a0, a1) = self.split_u16x32(a); self.combine_u32x8( self.reinterpret_u32_u16x16(a0), self.reinterpret_u32_u16x16(a1), ) } #[inline(always)] fn splat_mask16x32(self, a: i16) -> mask16x32 { let half = self.splat_mask16x16(a); self.combine_mask16x16(half, half) } #[inline(always)] fn not_mask16x32(self, a: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) } #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) } #[inline(always)] fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) } #[inline(always)] fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) } #[inline(always)] fn select_mask16x32( self, a: mask16x32, b: mask16x32, c: mask16x32, ) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); let (c0, c1) = self.split_mask16x32(c); self.combine_mask16x16( self.select_mask16x16(a0, b0, c0), self.select_mask16x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16( self.simd_eq_mask16x16(a0, b0), self.simd_eq_mask16x16(a1, b1), ) } #[inline(always)] fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x16(self, a: i32) -> i32x16 { let half = self.splat_i32x8(a); self.combine_i32x8(half, half) } #[inline(always)] fn not_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) } #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) } #[inline(always)] fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) } #[inline(always)] fn 
mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) } #[inline(always)] fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) } #[inline(always)] fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.or_i32x8(a0, b0), self.or_i32x8(a1, b1)) } #[inline(always)] fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) } #[inline(always)] fn shr_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b)) } #[inline(always)] fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) } #[inline(always)] fn shl_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shl_i32x8(a0, b), self.shl_i32x8(a1, b)) } #[inline(always)] fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) } #[inline(always)] fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) } #[inline(always)] fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) } #[inline(always)] fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) } #[inline(always)] fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) } #[inline(always)] fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, _) = self.split_i32x16(a); let (b0, _) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) } #[inline(always)] fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (_, a1) = self.split_i32x16(a); let (_, b1) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) } #[inline(always)] fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) } #[inline(always)] fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) } #[inline(always)] fn select_i32x16(self, a: mask32x16, b: i32x16, 
c: i32x16) -> i32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_i32x16(b); let (c0, c1) = self.split_i32x16(c); self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) } #[inline(always)] fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) } #[inline(always)] fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) } #[inline(always)] fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) } #[inline(always)] fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { let (a0, a1) = self.split_i32x16(a); self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) } #[inline(always)] fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_u32x8( self.reinterpret_u32_i32x8(a0), self.reinterpret_u32_i32x8(a1), ) } #[inline(always)] fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) } #[inline(always)] fn splat_u32x16(self, a: u32) -> u32x16 { let half = self.splat_u32x8(a); self.combine_u32x8(half, half) } #[inline(always)] fn not_u32x16(self, a: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) } #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) } #[inline(always)] fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) } #[inline(always)] fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) } #[inline(always)] fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) } #[inline(always)] fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) } #[inline(always)] fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) } #[inline(always)] fn shr_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b)) } #[inline(always)] fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.shrv_u32x8(a0, b0), 
self.shrv_u32x8(a1, b1)) } #[inline(always)] fn shl_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shl_u32x8(a0, b), self.shl_u32x8(a1, b)) } #[inline(always)] fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) } #[inline(always)] fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) } #[inline(always)] fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) } #[inline(always)] fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) } #[inline(always)] fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) } #[inline(always)] fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, _) = self.split_u32x16(a); let (b0, _) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) } #[inline(always)] fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (_, a1) = self.split_u32x16(a); let (_, b1) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) } #[inline(always)] fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) } #[inline(always)] fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) } #[inline(always)] fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_u32x16(b); let (c0, c1) = self.split_u32x16(c); self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) } #[inline(always)] fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) } #[inline(always)] fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) } #[inline(always)] fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { unsafe { let v0 = _mm_loadu_si128(src.as_ptr().add(0) as *const __m128i); let v1 = _mm_loadu_si128(src.as_ptr().add(4) as *const __m128i); let v2 = _mm_loadu_si128(src.as_ptr().add(8) as *const __m128i); let v3 = 
_mm_loadu_si128(src.as_ptr().add(12) as *const __m128i); let tmp0 = _mm_unpacklo_epi32(v0, v1); let tmp1 = _mm_unpackhi_epi32(v0, v1); let tmp2 = _mm_unpacklo_epi32(v2, v3); let tmp3 = _mm_unpackhi_epi32(v2, v3); let out0 = _mm_unpacklo_epi64(tmp0, tmp2); let out1 = _mm_unpackhi_epi64(tmp0, tmp2); let out2 = _mm_unpacklo_epi64(tmp1, tmp3); let out3 = _mm_unpackhi_epi64(tmp1, tmp3); self.combine_u32x8( self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)), self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)), ) } } #[inline(always)] fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { let fb = crate::Fallback::new(); fb.store_interleaved_128_u32x16(a.val.simd_into(fb), dest); } #[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { let (a0, a1) = self.split_u32x16(a); self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) } #[inline(always)] fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) } #[inline(always)] fn splat_mask32x16(self, a: i32) -> mask32x16 { let half = self.splat_mask32x8(a); self.combine_mask32x8(half, half) } #[inline(always)] fn not_mask32x16(self, a: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) } #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) } #[inline(always)] fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) } #[inline(always)] fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) } #[inline(always)] fn select_mask32x16( self, a: mask32x16, b: mask32x16, c: mask32x16, ) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); let (c0, c1) = self.split_mask32x16(c); self.combine_mask32x8( self.select_mask32x8(a0, b0, c0), self.select_mask32x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) } #[inline(always)] fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f64x8(self, a: f64) -> f64x8 { let half = self.splat_f64x4(a); self.combine_f64x4(half, half) } #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) } #[inline(always)] fn neg_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) } #[inline(always)] fn sqrt_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.sqrt_f64x4(a0), 
self.sqrt_f64x4(a1)) } #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) } #[inline(always)] fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) } #[inline(always)] fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) } #[inline(always)] fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) } #[inline(always)] fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) } #[inline(always)] fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) } #[inline(always)] fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) } #[inline(always)] fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) } #[inline(always)] fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) } #[inline(always)] fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) } #[inline(always)] fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, _) = self.split_f64x8(a); let (b0, _) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) } #[inline(always)] fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (_, a1) = self.split_f64x8(a); let (_, b1) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) } #[inline(always)] fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) } #[inline(always)] fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) } #[inline(always)] fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) } #[inline(always)] fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.max_precise_f64x4(a0, b0), self.max_precise_f64x4(a1, 
b1), ) } #[inline(always)] fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) } #[inline(always)] fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.min_precise_f64x4(a0, b0), self.min_precise_f64x4(a1, b1), ) } #[inline(always)] fn madd_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1)) } #[inline(always)] fn msub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1)) } #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) } #[inline(always)] fn fract_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) } #[inline(always)] fn trunc_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) } #[inline(always)] fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) } #[inline(always)] fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( self.reinterpret_f32_f64x4(a0), self.reinterpret_f32_f64x4(a1), ) } #[inline(always)] fn splat_mask64x8(self, a: i64) -> mask64x8 { let half = self.splat_mask64x4(a); self.combine_mask64x4(half, half) } #[inline(always)] fn not_mask64x8(self, a: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1)) } #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1)) } #[inline(always)] fn or_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, b1)) } #[inline(always)] fn xor_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1)) } #[inline(always)] fn select_mask64x8( self, a: mask64x8, b: mask64x8, c: mask64x8, ) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); let (c0, c1) = self.split_mask64x8(c); self.combine_mask64x4( self.select_mask64x4(a0, b0, c0), self.select_mask64x4(a1, b1, c1), ) } #[inline(always)] fn 
simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
        let mut b0 = [0; 4usize];
        let mut b1 = [0; 4usize];
        b0.copy_from_slice(&a.val[0..4usize]);
        b1.copy_from_slice(&a.val[4usize..8usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
}

// Bitwise conversions between the portable vector types and the raw SSE
// register types; each is a plain transmute between same-sized values.
impl<S: Simd> SimdFrom<__m128, S> for f32x4<S> { #[inline(always)] fn simd_from(arch: __m128, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<f32x4<S>> for __m128 { #[inline(always)] fn from(value: f32x4<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for i8x16<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<i8x16<S>> for __m128i { #[inline(always)] fn from(value: i8x16<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for u8x16<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<u8x16<S>> for __m128i { #[inline(always)] fn from(value: u8x16<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for mask8x16<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<mask8x16<S>> for __m128i { #[inline(always)] fn from(value: mask8x16<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for i16x8<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<i16x8<S>> for __m128i { #[inline(always)] fn from(value: i16x8<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for u16x8<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<u16x8<S>> for __m128i { #[inline(always)] fn from(value: u16x8<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for mask16x8<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<mask16x8<S>> for __m128i { #[inline(always)] fn from(value: mask16x8<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for i32x4<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<i32x4<S>> for __m128i { #[inline(always)] fn from(value: i32x4<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for u32x4<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<u32x4<S>> for __m128i { #[inline(always)] fn from(value: u32x4<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for mask32x4<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<mask32x4<S>> for __m128i { #[inline(always)] fn from(value: mask32x4<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128d, S> for f64x2<S> { #[inline(always)] fn simd_from(arch: __m128d, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<f64x2<S>> for __m128d { #[inline(always)] fn from(value: f64x2<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }
impl<S: Simd> SimdFrom<__m128i, S> for mask64x2<S> { #[inline(always)] fn simd_from(arch: __m128i, simd: S) -> Self { Self { val: unsafe { core::mem::transmute(arch) }, simd } } }
impl<S: Simd> From<mask64x2<S>> for __m128i { #[inline(always)] fn from(value: mask64x2<S>) -> Self { unsafe { core::mem::transmute(value.val) } } }

fearless_simd-0.3.0/src/generated/wasm.rs

// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

// This file is autogenerated by fearless_simd_gen

use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
use crate::{
    f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32,
    i32x4, i32x8, i32x16, mask8x16, mask8x32, mask8x64, mask16x8, mask16x16, mask16x32,
    mask32x4, mask32x8, mask32x16, mask64x2, mask64x4, mask64x8, u8x16, u8x32, u8x64, u16x8,
    u16x16, u16x32, u32x4, u32x8, u32x16,
};
use core::arch::wasm32::*;

#[doc = r#" The SIMD token for the "wasm128" level."#]
#[derive(Clone, Copy, Debug)]
pub struct WasmSimd128 {
    _private: (),
}

impl WasmSimd128 {
    #[inline]
    pub const fn new_unchecked() -> Self {
        Self { _private: () }
    }
}

impl Seal for WasmSimd128 {}

impl Simd for WasmSimd128 {
    type f32s = f32x4<Self>;
    type u8s = u8x16<Self>;
    type i8s = i8x16<Self>;
    type u16s = u16x8<Self>;
    type i16s = i16x8<Self>;
    type u32s = u32x4<Self>;
    type i32s = i32x4<Self>;
    type mask8s = mask8x16<Self>;
    type mask16s = mask16x8<Self>;
    type mask32s = mask32x4<Self>;
    #[inline(always)]
    fn level(self) -> Level {
        Level::WasmSimd128(self)
    }
    #[inline]
    fn vectorize<F: FnOnce() -> R, R>(self, f: F) -> R {
        #[inline]
        unsafe fn vectorize_simd128<F: FnOnce() -> R, R>(f: F) -> R {
            f()
        }
        unsafe { vectorize_simd128(f) }
    }
    #[inline(always)] fn splat_f32x4(self, val: f32) -> f32x4<Self> { f32x4_splat(val).simd_into(self) }
    #[inline(always)] fn abs_f32x4(self, a: f32x4<Self>) -> f32x4<Self> { f32x4_abs(a.into()).simd_into(self) }
    #[inline(always)] fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self> { f32x4_neg(a.into()).simd_into(self) }
    #[inline(always)] fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self> { f32x4_sqrt(a.into()).simd_into(self) }
    #[inline(always)] fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { f32x4_add(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { f32x4_sub(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn mul_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { f32x4_mul(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn div_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { f32x4_div(a.into(), b.into()).simd_into(self) }
    // Take the sign bit from `b` and the magnitude bits from `a`.
    #[inline(always)]
    fn copysign_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
        let sign_mask = f32x4_splat(-0.0_f32);
        let sign_bits = v128_and(b.into(), sign_mask.into());
        let magnitude = v128_andnot(a.into(), sign_mask.into());
        v128_or(magnitude, sign_bits).simd_into(self)
    }
    #[inline(always)] fn simd_eq_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { f32x4_eq(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn simd_lt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { f32x4_lt(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn simd_le_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { f32x4_le(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn simd_ge_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { f32x4_ge(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn simd_gt_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> mask32x4<Self> { f32x4_gt(a.into(), b.into()).simd_into(self) }
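    // NOTE (illustrative, not generated code): the zip/unzip family below is
    // built on `u32x4_shuffle`, whose const indices select lanes from the
    // concatenation of both inputs: 0..=3 address `a`'s lanes, 4..=7 address
    // `b`'s. So `<0, 4, 1, 5>` interleaves the low halves (zip_low) and
    // `<0, 2, 4, 6>` gathers the even lanes (unzip_low). A hypothetical call,
    // assuming lanes a = [a0, a1, a2, a3] and b = [b0, b1, b2, b3]:
    //
    //     // u32x4_shuffle::<0, 4, 1, 5>(a, b) == [a0, b0, a1, b1]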
    #[inline(always)] fn zip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { u32x4_shuffle::<0, 4, 1, 5>(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn zip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { u32x4_shuffle::<2, 6, 3, 7>(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn unzip_low_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { u32x4_shuffle::<0, 2, 4, 6>(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn unzip_high_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { u32x4_shuffle::<1, 3, 5, 7>(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn max_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { f32x4_max(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn max_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { f32x4_pmax(b.into(), a.into()).simd_into(self) }
    #[inline(always)] fn min_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { f32x4_min(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn min_precise_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> { f32x4_pmin(b.into(), a.into()).simd_into(self) }
    #[inline(always)] fn madd_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> { a.mul(b).add(c) }
    #[inline(always)] fn msub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> { a.mul(b).sub(c) }
    #[inline(always)] fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> { f32x4_floor(a.into()).simd_into(self) }
    #[inline(always)] fn fract_f32x4(self, a: f32x4<Self>) -> f32x4<Self> { a.sub(a.trunc()) }
    #[inline(always)] fn trunc_f32x4(self, a: f32x4<Self>) -> f32x4<Self> { f32x4_trunc(a.into()).simd_into(self) }
    #[inline(always)] fn select_f32x4(self, a: mask32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) }
    #[inline(always)]
    fn combine_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x8<Self> {
        let mut result = [0.0; 8usize];
        result[0..4usize].copy_from_slice(&a.val);
        result[4usize..8usize].copy_from_slice(&b.val);
        result.simd_into(self)
    }
    #[inline(always)] fn reinterpret_f64_f32x4(self, a: f32x4<Self>) -> f64x2<Self> { <v128>::from(a).simd_into(self) }
    #[inline(always)] fn reinterpret_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> { <v128>::from(a).simd_into(self) }
    #[inline(always)] fn reinterpret_u8_f32x4(self, a: f32x4<Self>) -> u8x16<Self> { <v128>::from(a).simd_into(self) }
    #[inline(always)] fn reinterpret_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> { <v128>::from(a).simd_into(self) }
    #[inline(always)] fn cvt_u32_f32x4(self, a: f32x4<Self>) -> u32x4<Self> { u32x4_trunc_sat_f32x4(a.into()).simd_into(self) }
    #[inline(always)] fn cvt_i32_f32x4(self, a: f32x4<Self>) -> i32x4<Self> { i32x4_trunc_sat_f32x4(a.into()).simd_into(self) }
    #[inline(always)] fn splat_i8x16(self, val: i8) -> i8x16<Self> { i8x16_splat(val).simd_into(self) }
    #[inline(always)] fn not_i8x16(self, a: i8x16<Self>) -> i8x16<Self> { v128_not(a.into()).simd_into(self) }
    #[inline(always)] fn add_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> { i8x16_add(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn sub_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> { i8x16_sub(a.into(), b.into()).simd_into(self) }
    // Widening multiply, then gather the low byte of each 16-bit product.
    #[inline(always)]
    fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
        let low = i16x8_extmul_low_i8x16(a.into(), b.into());
        let high = i16x8_extmul_high_i8x16(a.into(), b.into());
        u8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(low, high)
            .simd_into(self)
    }
    #[inline(always)] fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> { v128_and(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn or_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> { v128_or(a.into(), b.into()).simd_into(self) }
    #[inline(always)] fn xor_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> { v128_xor(a.into(), b.into()).simd_into(self) }
    #[inline(always)]
    fn shr_i8x16(self, a: i8x16<Self>, shift: u32) -> i8x16<Self> {
i8x16_shr(a.into(), shift).simd_into(self) } #[inline(always)] fn shrv_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_i8x16(self, a: i8x16, shift: u32) -> i8x16 { i8x16_shl(a.into(), shift).simd_into(self) } #[inline(always)] fn simd_eq_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { i8x16_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_lt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { i8x16_lt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_le_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { i8x16_le(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_ge_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { i8x16_ge(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_gt_i8x16(self, a: i8x16, b: i8x16) -> mask8x16 { i8x16_gt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(a.into(), b.into()) .simd_into(self) } #[inline(always)] fn zip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( a.into(), b.into(), ) .simd_into(self) } #[inline(always)] fn unzip_low_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { u8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>( a.into(), b.into(), ) .simd_into(self) } #[inline(always)] fn unzip_high_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { u8x16_shuffle::<1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31>( a.into(), b.into(), ) .simd_into(self) } #[inline(always)] fn select_i8x16(self, a: mask8x16, b: i8x16, c: i8x16) -> i8x16 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn min_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { i8x16_min(a.into(), b.into()).simd_into(self) } #[inline(always)] fn max_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { i8x16_max(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_i8x16(self, a: i8x16, b: i8x16) -> i8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i8x16(self, a: i8x16) -> i8x16 { i8x16_neg(a.into()).simd_into(self) } #[inline(always)] fn reinterpret_u8_i8x16(self, a: i8x16) -> u8x16 { ::from(a).simd_into(self) } #[inline(always)] fn reinterpret_u32_i8x16(self, a: i8x16) -> u32x4 { ::from(a).simd_into(self) } #[inline(always)] fn splat_u8x16(self, val: u8) -> u8x16 { u8x16_splat(val).simd_into(self) } #[inline(always)] fn not_u8x16(self, a: u8x16) -> u8x16 { v128_not(a.into()).simd_into(self) } #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { u8x16_add(a.into(), b.into()).simd_into(self) } #[inline(always)] fn sub_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { u8x16_sub(a.into(), b.into()).simd_into(self) } #[inline(always)] fn mul_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { let low = u16x8_extmul_low_u8x16(a.into(), b.into()); let high = u16x8_extmul_high_u8x16(a.into(), b.into()); u8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>(low, high) .simd_into(self) } #[inline(always)] fn and_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] fn or_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] fn xor_u8x16(self, a: u8x16, 
b: u8x16) -> u8x16 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] fn shr_u8x16(self, a: u8x16, shift: u32) -> u8x16 { u8x16_shr(a.into(), shift).simd_into(self) } #[inline(always)] fn shrv_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_u8x16(self, a: u8x16, shift: u32) -> u8x16 { u8x16_shl(a.into(), shift).simd_into(self) } #[inline(always)] fn simd_eq_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { u8x16_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_lt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { u8x16_lt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_le_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { u8x16_le(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_ge_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { u8x16_ge(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_gt_u8x16(self, a: u8x16, b: u8x16) -> mask8x16 { u8x16_gt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(a.into(), b.into()) .simd_into(self) } #[inline(always)] fn zip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( a.into(), b.into(), ) .simd_into(self) } #[inline(always)] fn unzip_low_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { u8x16_shuffle::<0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30>( a.into(), b.into(), ) .simd_into(self) } #[inline(always)] fn unzip_high_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { u8x16_shuffle::<1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31>( a.into(), b.into(), ) .simd_into(self) } #[inline(always)] fn select_u8x16(self, a: mask8x16, b: u8x16, c: u8x16) -> u8x16 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn min_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { u8x16_min(a.into(), b.into()).simd_into(self) } #[inline(always)] fn max_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { u8x16_max(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_u8x16(self, a: u8x16, b: u8x16) -> u8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn widen_u8x16(self, a: u8x16) -> u16x16 { let low = u16x8_extend_low_u8x16(a.into()); let high = u16x8_extend_high_u8x16(a.into()); self.combine_u16x8(low.simd_into(self), high.simd_into(self)) } #[inline(always)] fn reinterpret_u32_u8x16(self, a: u8x16) -> u32x4 { ::from(a).simd_into(self) } #[inline(always)] fn splat_mask8x16(self, val: i8) -> mask8x16 { i8x16_splat(val).simd_into(self) } #[inline(always)] fn not_mask8x16(self, a: mask8x16) -> mask8x16 { v128_not(a.into()).simd_into(self) } #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] fn or_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] fn xor_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] fn select_mask8x16( self, a: mask8x16, b: mask8x16, c: mask8x16, ) -> mask8x16 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn simd_eq_mask8x16(self, a: mask8x16, b: mask8x16) -> 
mask8x16 { i8x16_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i16x8(self, val: i16) -> i16x8 { i16x8_splat(val).simd_into(self) } #[inline(always)] fn not_i16x8(self, a: i16x8) -> i16x8 { v128_not(a.into()).simd_into(self) } #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { i16x8_add(a.into(), b.into()).simd_into(self) } #[inline(always)] fn sub_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { i16x8_sub(a.into(), b.into()).simd_into(self) } #[inline(always)] fn mul_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { i16x8_mul(a.into(), b.into()).simd_into(self) } #[inline(always)] fn and_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] fn or_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] fn xor_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] fn shr_i16x8(self, a: i16x8, shift: u32) -> i16x8 { i16x8_shr(a.into(), shift).simd_into(self) } #[inline(always)] fn shrv_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_i16x8(self, a: i16x8, shift: u32) -> i16x8 { i16x8_shl(a.into(), shift).simd_into(self) } #[inline(always)] fn simd_eq_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { i16x8_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_lt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { i16x8_lt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_le_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { i16x8_le(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_ge_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { i16x8_ge(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_gt_i16x8(self, a: i16x8, b: i16x8) -> mask16x8 { i16x8_gt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_low_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { u16x8_shuffle::<0, 2, 4, 6, 8, 10, 12, 14>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_high_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { u16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn select_i16x8(self, a: mask16x8, b: i16x8, c: i16x8) -> i16x8 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn min_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { i16x8_min(a.into(), b.into()).simd_into(self) } #[inline(always)] fn max_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { i16x8_max(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_i16x8(self, a: i16x8, b: i16x8) -> i16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i16x8(self, a: i16x8) -> i16x8 { i16x8_neg(a.into()).simd_into(self) } #[inline(always)] fn reinterpret_u8_i16x8(self, a: i16x8) -> u8x16 { 
::from(a).simd_into(self) } #[inline(always)] fn reinterpret_u32_i16x8(self, a: i16x8) -> u32x4 { ::from(a).simd_into(self) } #[inline(always)] fn splat_u16x8(self, val: u16) -> u16x8 { u16x8_splat(val).simd_into(self) } #[inline(always)] fn not_u16x8(self, a: u16x8) -> u16x8 { v128_not(a.into()).simd_into(self) } #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_add(a.into(), b.into()).simd_into(self) } #[inline(always)] fn sub_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_sub(a.into(), b.into()).simd_into(self) } #[inline(always)] fn mul_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_mul(a.into(), b.into()).simd_into(self) } #[inline(always)] fn and_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] fn or_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] fn xor_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] fn shr_u16x8(self, a: u16x8, shift: u32) -> u16x8 { u16x8_shr(a.into(), shift).simd_into(self) } #[inline(always)] fn shrv_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_u16x8(self, a: u16x8, shift: u32) -> u16x8 { u16x8_shl(a.into(), shift).simd_into(self) } #[inline(always)] fn simd_eq_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { u16x8_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_lt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { u16x8_lt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_le_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { u16x8_le(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_ge_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { u16x8_ge(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_gt_u16x8(self, a: u16x8, b: u16x8) -> mask16x8 { u16x8_gt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_low_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_shuffle::<0, 2, 4, 6, 8, 10, 12, 14>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_high_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn select_u16x8(self, a: mask16x8, b: u16x8, c: u16x8) -> u16x8 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn min_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_min(a.into(), b.into()).simd_into(self) } #[inline(always)] fn max_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_max(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_u16x8(self, a: u16x8, b: u16x8) -> u16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u16x8(self, a: u16x8) -> u8x16 { ::from(a).simd_into(self) } #[inline(always)] fn reinterpret_u32_u16x8(self, a: u16x8) -> u32x4 { ::from(a).simd_into(self) } #[inline(always)] fn splat_mask16x8(self, val: i16) -> mask16x8 { i16x8_splat(val).simd_into(self) } #[inline(always)] fn not_mask16x8(self, 
a: mask16x8) -> mask16x8 { v128_not(a.into()).simd_into(self) } #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] fn or_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] fn xor_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] fn select_mask16x8( self, a: mask16x8, b: mask16x8, c: mask16x8, ) -> mask16x8 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn simd_eq_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { i16x8_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_i32x4(self, val: i32) -> i32x4 { i32x4_splat(val).simd_into(self) } #[inline(always)] fn not_i32x4(self, a: i32x4) -> i32x4 { v128_not(a.into()).simd_into(self) } #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { i32x4_add(a.into(), b.into()).simd_into(self) } #[inline(always)] fn sub_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { i32x4_sub(a.into(), b.into()).simd_into(self) } #[inline(always)] fn mul_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { i32x4_mul(a.into(), b.into()).simd_into(self) } #[inline(always)] fn and_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] fn or_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] fn xor_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] fn shr_i32x4(self, a: i32x4, shift: u32) -> i32x4 { i32x4_shr(a.into(), shift).simd_into(self) } #[inline(always)] fn shrv_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_i32x4(self, a: i32x4, shift: u32) -> i32x4 { i32x4_shl(a.into(), shift).simd_into(self) } #[inline(always)] fn simd_eq_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { i32x4_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_lt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { i32x4_lt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_le_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { i32x4_le(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_ge_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { i32x4_ge(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_gt_i32x4(self, a: i32x4, b: i32x4) -> mask32x4 { i32x4_gt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { u32x4_shuffle::<0, 4, 1, 5>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { u32x4_shuffle::<2, 6, 3, 7>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_low_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { u32x4_shuffle::<0, 2, 4, 6>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_high_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { u32x4_shuffle::<1, 3, 5, 7>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn select_i32x4(self, a: mask32x4, b: i32x4, c: i32x4) -> i32x4 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } 
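// NOTE (illustrative, not generated code): every `select_*` on this level lowers to
// `v128_bitselect(b, c, mask)`, which takes each *bit* of the result from `b` where
// the mask bit is 1 and from `c` where it is 0. That is only equivalent to a
// lane-wise select because the masks produced by the `simd_*` comparisons are
// all-ones or all-zeros per lane. A minimal scalar model of the i32x4 case, using
// plain `[i32; 4]` arrays (`select_i32x4_model` is an illustrative name, not part
// of the crate):
//
//     fn select_i32x4_model(mask: [i32; 4], b: [i32; 4], c: [i32; 4]) -> [i32; 4] {
//         core::array::from_fn(|i| (mask[i] & b[i]) | (!mask[i] & c[i]))
//     }
//
// A lane of `-1` (all bits set) picks `b`'s lane; `0` picks `c`'s.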
#[inline(always)] fn min_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { i32x4_min(a.into(), b.into()).simd_into(self) } #[inline(always)] fn max_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { i32x4_max(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_i32x4(self, a: i32x4, b: i32x4) -> i32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn neg_i32x4(self, a: i32x4) -> i32x4 { i32x4_neg(a.into()).simd_into(self) } #[inline(always)] fn reinterpret_u8_i32x4(self, a: i32x4) -> u8x16 { ::from(a).simd_into(self) } #[inline(always)] fn reinterpret_u32_i32x4(self, a: i32x4) -> u32x4 { ::from(a).simd_into(self) } #[inline(always)] fn cvt_f32_i32x4(self, a: i32x4) -> f32x4 { f32x4_convert_i32x4(a.into()).simd_into(self) } #[inline(always)] fn splat_u32x4(self, val: u32) -> u32x4 { u32x4_splat(val).simd_into(self) } #[inline(always)] fn not_u32x4(self, a: u32x4) -> u32x4 { v128_not(a.into()).simd_into(self) } #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_add(a.into(), b.into()).simd_into(self) } #[inline(always)] fn sub_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_sub(a.into(), b.into()).simd_into(self) } #[inline(always)] fn mul_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_mul(a.into(), b.into()).simd_into(self) } #[inline(always)] fn and_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] fn or_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] fn xor_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] fn shr_u32x4(self, a: u32x4, shift: u32) -> u32x4 { u32x4_shr(a.into(), shift).simd_into(self) } #[inline(always)] fn shrv_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { core::array::from_fn(|i| core::ops::Shr::shr(a.val[i], b.val[i])).simd_into(self) } #[inline(always)] fn shl_u32x4(self, a: u32x4, shift: u32) -> u32x4 { u32x4_shl(a.into(), shift).simd_into(self) } #[inline(always)] fn simd_eq_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { u32x4_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_lt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { u32x4_lt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_le_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { u32x4_le(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_ge_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { u32x4_ge(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_gt_u32x4(self, a: u32x4, b: u32x4) -> mask32x4 { u32x4_gt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_shuffle::<0, 4, 1, 5>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_shuffle::<2, 6, 3, 7>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_low_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_shuffle::<0, 2, 4, 6>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_high_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_shuffle::<1, 3, 5, 7>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn select_u32x4(self, a: mask32x4, b: u32x4, c: u32x4) -> u32x4 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn min_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_min(a.into(), b.into()).simd_into(self) } 
#[inline(always)] fn max_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_max(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_u32x4(self, a: u32x4, b: u32x4) -> u32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u32x4(self, a: u32x4) -> u8x16 { ::from(a).simd_into(self) } #[inline(always)] fn cvt_f32_u32x4(self, a: u32x4) -> f32x4 { f32x4_convert_u32x4(a.into()).simd_into(self) } #[inline(always)] fn splat_mask32x4(self, val: i32) -> mask32x4 { i32x4_splat(val).simd_into(self) } #[inline(always)] fn not_mask32x4(self, a: mask32x4) -> mask32x4 { v128_not(a.into()).simd_into(self) } #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] fn or_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] fn xor_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] fn select_mask32x4( self, a: mask32x4, b: mask32x4, c: mask32x4, ) -> mask32x4 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn simd_eq_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { i32x4_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f64x2(self, val: f64) -> f64x2 { f64x2_splat(val).simd_into(self) } #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { f64x2_abs(a.into()).simd_into(self) } #[inline(always)] fn neg_f64x2(self, a: f64x2) -> f64x2 { f64x2_neg(a.into()).simd_into(self) } #[inline(always)] fn sqrt_f64x2(self, a: f64x2) -> f64x2 { f64x2_sqrt(a.into()).simd_into(self) } #[inline(always)] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_add(a.into(), b.into()).simd_into(self) } #[inline(always)] fn sub_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_sub(a.into(), b.into()).simd_into(self) } #[inline(always)] fn mul_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_mul(a.into(), b.into()).simd_into(self) } #[inline(always)] fn div_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_div(a.into(), b.into()).simd_into(self) } #[inline(always)] fn copysign_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { let sign_mask = f64x2_splat(-0.0_f64); let sign_bits = v128_and(b.into(), sign_mask.into()); let magnitude = v128_andnot(a.into(), sign_mask.into()); v128_or(magnitude, sign_bits).simd_into(self) } #[inline(always)] fn simd_eq_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { f64x2_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_lt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { f64x2_lt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_le_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { f64x2_le(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_ge_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { f64x2_ge(a.into(), b.into()).simd_into(self) } #[inline(always)] fn simd_gt_f64x2(self, a: f64x2, b: f64x2) -> mask64x2 { f64x2_gt(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn zip_high_f64x2(self, a: 
f64x2, b: f64x2) -> f64x2 { u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_low_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { u64x2_shuffle::<0, 2>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn unzip_high_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { u64x2_shuffle::<1, 3>(a.into(), b.into()).simd_into(self) } #[inline(always)] fn max_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_max(a.into(), b.into()).simd_into(self) } #[inline(always)] fn max_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_pmax(b.into(), a.into()).simd_into(self) } #[inline(always)] fn min_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_min(a.into(), b.into()).simd_into(self) } #[inline(always)] fn min_precise_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { f64x2_pmin(b.into(), a.into()).simd_into(self) } #[inline(always)] fn madd_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { a.mul(b).add(c) } #[inline(always)] fn msub_f64x2(self, a: f64x2, b: f64x2, c: f64x2) -> f64x2 { a.mul(b).sub(c) } #[inline(always)] fn floor_f64x2(self, a: f64x2) -> f64x2 { f64x2_floor(a.into()).simd_into(self) } #[inline(always)] fn fract_f64x2(self, a: f64x2) -> f64x2 { a.sub(a.trunc()) } #[inline(always)] fn trunc_f64x2(self, a: f64x2) -> f64x2 { f64x2_trunc(a.into()).simd_into(self) } #[inline(always)] fn select_f64x2(self, a: mask64x2, b: f64x2, c: f64x2) -> f64x2 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn combine_f64x2(self, a: f64x2, b: f64x2) -> f64x4 { let mut result = [0.0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn reinterpret_f32_f64x2(self, a: f64x2) -> f32x4 { ::from(a).simd_into(self) } #[inline(always)] fn splat_mask64x2(self, val: i64) -> mask64x2 { i64x2_splat(val).simd_into(self) } #[inline(always)] fn not_mask64x2(self, a: mask64x2) -> mask64x2 { v128_not(a.into()).simd_into(self) } #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { v128_and(a.into(), b.into()).simd_into(self) } #[inline(always)] fn or_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { v128_or(a.into(), b.into()).simd_into(self) } #[inline(always)] fn xor_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { v128_xor(a.into(), b.into()).simd_into(self) } #[inline(always)] fn select_mask64x2( self, a: mask64x2, b: mask64x2, c: mask64x2, ) -> mask64x2 { v128_bitselect(b.into(), c.into(), a.into()).simd_into(self) } #[inline(always)] fn simd_eq_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { i64x2_eq(a.into(), b.into()).simd_into(self) } #[inline(always)] fn combine_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x4 { let mut result = [0; 4usize]; result[0..2usize].copy_from_slice(&a.val); result[2usize..4usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn splat_f32x8(self, a: f32) -> f32x8 { let half = self.splat_f32x4(a); self.combine_f32x4(half, half) } #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) } #[inline(always)] fn neg_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.neg_f32x4(a0), self.neg_f32x4(a1)) } #[inline(always)] fn sqrt_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.sqrt_f32x4(a0), self.sqrt_f32x4(a1)) } #[inline(always)] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, 
a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.add_f32x4(a0, b0), self.add_f32x4(a1, b1)) } #[inline(always)] fn sub_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.sub_f32x4(a0, b0), self.sub_f32x4(a1, b1)) } #[inline(always)] fn mul_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.mul_f32x4(a0, b0), self.mul_f32x4(a1, b1)) } #[inline(always)] fn div_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.div_f32x4(a0, b0), self.div_f32x4(a1, b1)) } #[inline(always)] fn copysign_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.copysign_f32x4(a0, b0), self.copysign_f32x4(a1, b1)) } #[inline(always)] fn simd_eq_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_eq_f32x4(a0, b0), self.simd_eq_f32x4(a1, b1)) } #[inline(always)] fn simd_lt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_lt_f32x4(a0, b0), self.simd_lt_f32x4(a1, b1)) } #[inline(always)] fn simd_le_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_le_f32x4(a0, b0), self.simd_le_f32x4(a1, b1)) } #[inline(always)] fn simd_ge_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_ge_f32x4(a0, b0), self.simd_ge_f32x4(a1, b1)) } #[inline(always)] fn simd_gt_f32x8(self, a: f32x8, b: f32x8) -> mask32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_mask32x4(self.simd_gt_f32x4(a0, b0), self.simd_gt_f32x4(a1, b1)) } #[inline(always)] fn zip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, _) = self.split_f32x8(a); let (b0, _) = self.split_f32x8(b); self.combine_f32x4(self.zip_low_f32x4(a0, b0), self.zip_high_f32x4(a0, b0)) } #[inline(always)] fn zip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (_, a1) = self.split_f32x8(a); let (_, b1) = self.split_f32x8(b); self.combine_f32x4(self.zip_low_f32x4(a1, b1), self.zip_high_f32x4(a1, b1)) } #[inline(always)] fn unzip_low_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_low_f32x4(a0, a1), self.unzip_low_f32x4(b0, b1)) } #[inline(always)] fn unzip_high_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.unzip_high_f32x4(a0, a1), self.unzip_high_f32x4(b0, b1)) } #[inline(always)] fn max_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.max_f32x4(a0, b0), self.max_f32x4(a1, b1)) } #[inline(always)] fn max_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.max_precise_f32x4(a0, b0), self.max_precise_f32x4(a1, b1), ) } #[inline(always)] fn min_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = 
self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4(self.min_f32x4(a0, b0), self.min_f32x4(a1, b1)) } #[inline(always)] fn min_precise_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); self.combine_f32x4( self.min_precise_f32x4(a0, b0), self.min_precise_f32x4(a1, b1), ) } #[inline(always)] fn madd_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.madd_f32x4(a0, b0, c0), self.madd_f32x4(a1, b1, c1)) } #[inline(always)] fn msub_f32x8(self, a: f32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.msub_f32x4(a0, b0, c0), self.msub_f32x4(a1, b1, c1)) } #[inline(always)] fn floor_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1)) } #[inline(always)] fn fract_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.fract_f32x4(a0), self.fract_f32x4(a1)) } #[inline(always)] fn trunc_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.trunc_f32x4(a0), self.trunc_f32x4(a1)) } #[inline(always)] fn select_f32x8(self, a: mask32x8, b: f32x8, c: f32x8) -> f32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_f32x8(b); let (c0, c1) = self.split_f32x8(c); self.combine_f32x4(self.select_f32x4(a0, b0, c0), self.select_f32x4(a1, b1, c1)) } #[inline(always)] fn combine_f32x8(self, a: f32x8, b: f32x8) -> f32x16 { let mut result = [0.0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f32x8(self, a: f32x8) -> (f32x4, f32x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f64_f32x8(self, a: f32x8) -> f64x4 { let (a0, a1) = self.split_f32x8(a); self.combine_f64x2( self.reinterpret_f64_f32x4(a0), self.reinterpret_f64_f32x4(a1), ) } #[inline(always)] fn reinterpret_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4( self.reinterpret_i32_f32x4(a0), self.reinterpret_i32_f32x4(a1), ) } #[inline(always)] fn reinterpret_u8_f32x8(self, a: f32x8) -> u8x32 { let (a0, a1) = self.split_f32x8(a); self.combine_u8x16(self.reinterpret_u8_f32x4(a0), self.reinterpret_u8_f32x4(a1)) } #[inline(always)] fn reinterpret_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4( self.reinterpret_u32_f32x4(a0), self.reinterpret_u32_f32x4(a1), ) } #[inline(always)] fn cvt_u32_f32x8(self, a: f32x8) -> u32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_u32x4(self.cvt_u32_f32x4(a0), self.cvt_u32_f32x4(a1)) } #[inline(always)] fn cvt_i32_f32x8(self, a: f32x8) -> i32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_i32x4(self.cvt_i32_f32x4(a0), self.cvt_i32_f32x4(a1)) } #[inline(always)] fn splat_i8x32(self, a: i8) -> i8x32 { let half = self.splat_i8x16(a); self.combine_i8x16(half, half) } #[inline(always)] fn not_i8x32(self, a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.not_i8x16(a0), self.not_i8x16(a1)) } #[inline(always)] fn add_i8x32(self, a: 
i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.add_i8x16(a0, b0), self.add_i8x16(a1, b1)) } #[inline(always)] fn sub_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.sub_i8x16(a0, b0), self.sub_i8x16(a1, b1)) } #[inline(always)] fn mul_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.mul_i8x16(a0, b0), self.mul_i8x16(a1, b1)) } #[inline(always)] fn and_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.and_i8x16(a0, b0), self.and_i8x16(a1, b1)) } #[inline(always)] fn or_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.or_i8x16(a0, b0), self.or_i8x16(a1, b1)) } #[inline(always)] fn xor_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.xor_i8x16(a0, b0), self.xor_i8x16(a1, b1)) } #[inline(always)] fn shr_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shr_i8x16(a0, b), self.shr_i8x16(a1, b)) } #[inline(always)] fn shrv_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.shrv_i8x16(a0, b0), self.shrv_i8x16(a1, b1)) } #[inline(always)] fn shl_i8x32(self, a: i8x32, b: u32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.shl_i8x16(a0, b), self.shl_i8x16(a1, b)) } #[inline(always)] fn simd_eq_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_eq_i8x16(a0, b0), self.simd_eq_i8x16(a1, b1)) } #[inline(always)] fn simd_lt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_lt_i8x16(a0, b0), self.simd_lt_i8x16(a1, b1)) } #[inline(always)] fn simd_le_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_le_i8x16(a0, b0), self.simd_le_i8x16(a1, b1)) } #[inline(always)] fn simd_ge_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_ge_i8x16(a0, b0), self.simd_ge_i8x16(a1, b1)) } #[inline(always)] fn simd_gt_i8x32(self, a: i8x32, b: i8x32) -> mask8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_mask8x16(self.simd_gt_i8x16(a0, b0), self.simd_gt_i8x16(a1, b1)) } #[inline(always)] fn zip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, _) = self.split_i8x32(a); let (b0, _) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a0, b0), self.zip_high_i8x16(a0, b0)) } #[inline(always)] fn zip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (_, a1) = self.split_i8x32(a); let (_, b1) = self.split_i8x32(b); self.combine_i8x16(self.zip_low_i8x16(a1, b1), self.zip_high_i8x16(a1, b1)) } #[inline(always)] fn unzip_low_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_low_i8x16(a0, a1), 
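// unzip_low keeps the even-indexed lanes of the logical concatenation, so the two halves of `a` feed one 16-lane unzip and the two halves of `b` the other.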
self.unzip_low_i8x16(b0, b1)) } #[inline(always)] fn unzip_high_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.unzip_high_i8x16(a0, a1), self.unzip_high_i8x16(b0, b1)) } #[inline(always)] fn select_i8x32(self, a: mask8x32, b: i8x32, c: i8x32) -> i8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_i8x32(b); let (c0, c1) = self.split_i8x32(c); self.combine_i8x16(self.select_i8x16(a0, b0, c0), self.select_i8x16(a1, b1, c1)) } #[inline(always)] fn min_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.min_i8x16(a0, b0), self.min_i8x16(a1, b1)) } #[inline(always)] fn max_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); self.combine_i8x16(self.max_i8x16(a0, b0), self.max_i8x16(a1, b1)) } #[inline(always)] fn combine_i8x32(self, a: i8x32, b: i8x32) -> i8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i8x32(self, a: i8x32) -> (i8x16, i8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x32(self, a: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_i8x16(self.neg_i8x16(a0), self.neg_i8x16(a1)) } #[inline(always)] fn reinterpret_u8_i8x32(self, a: i8x32) -> u8x32 { let (a0, a1) = self.split_i8x32(a); self.combine_u8x16(self.reinterpret_u8_i8x16(a0), self.reinterpret_u8_i8x16(a1)) } #[inline(always)] fn reinterpret_u32_i8x32(self, a: i8x32) -> u32x8 { let (a0, a1) = self.split_i8x32(a); self.combine_u32x4( self.reinterpret_u32_i8x16(a0), self.reinterpret_u32_i8x16(a1), ) } #[inline(always)] fn splat_u8x32(self, a: u8) -> u8x32 { let half = self.splat_u8x16(a); self.combine_u8x16(half, half) } #[inline(always)] fn not_u8x32(self, a: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.not_u8x16(a0), self.not_u8x16(a1)) } #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.add_u8x16(a0, b0), self.add_u8x16(a1, b1)) } #[inline(always)] fn sub_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.sub_u8x16(a0, b0), self.sub_u8x16(a1, b1)) } #[inline(always)] fn mul_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.mul_u8x16(a0, b0), self.mul_u8x16(a1, b1)) } #[inline(always)] fn and_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.and_u8x16(a0, b0), self.and_u8x16(a1, b1)) } #[inline(always)] fn or_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.or_u8x16(a0, b0), self.or_u8x16(a1, b1)) } #[inline(always)] fn xor_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.xor_u8x16(a0, b0), self.xor_u8x16(a1, b1)) } #[inline(always)] fn shr_u8x32(self, a: u8x32, b: u32) -> 
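// Shifts by a scalar `u32` apply the same count to both halves; the shrv_* variants take a vector and shift each lane by the matching lane of `b`.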
u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.shr_u8x16(a0, b), self.shr_u8x16(a1, b)) } #[inline(always)] fn shrv_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.shrv_u8x16(a0, b0), self.shrv_u8x16(a1, b1)) } #[inline(always)] fn shl_u8x32(self, a: u8x32, b: u32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u8x16(self.shl_u8x16(a0, b), self.shl_u8x16(a1, b)) } #[inline(always)] fn simd_eq_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_eq_u8x16(a0, b0), self.simd_eq_u8x16(a1, b1)) } #[inline(always)] fn simd_lt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_lt_u8x16(a0, b0), self.simd_lt_u8x16(a1, b1)) } #[inline(always)] fn simd_le_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_le_u8x16(a0, b0), self.simd_le_u8x16(a1, b1)) } #[inline(always)] fn simd_ge_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_ge_u8x16(a0, b0), self.simd_ge_u8x16(a1, b1)) } #[inline(always)] fn simd_gt_u8x32(self, a: u8x32, b: u8x32) -> mask8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_mask8x16(self.simd_gt_u8x16(a0, b0), self.simd_gt_u8x16(a1, b1)) } #[inline(always)] fn zip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, _) = self.split_u8x32(a); let (b0, _) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a0, b0), self.zip_high_u8x16(a0, b0)) } #[inline(always)] fn zip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (_, a1) = self.split_u8x32(a); let (_, b1) = self.split_u8x32(b); self.combine_u8x16(self.zip_low_u8x16(a1, b1), self.zip_high_u8x16(a1, b1)) } #[inline(always)] fn unzip_low_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_low_u8x16(a0, a1), self.unzip_low_u8x16(b0, b1)) } #[inline(always)] fn unzip_high_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.unzip_high_u8x16(a0, a1), self.unzip_high_u8x16(b0, b1)) } #[inline(always)] fn select_u8x32(self, a: mask8x32, b: u8x32, c: u8x32) -> u8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_u8x32(b); let (c0, c1) = self.split_u8x32(c); self.combine_u8x16(self.select_u8x16(a0, b0, c0), self.select_u8x16(a1, b1, c1)) } #[inline(always)] fn min_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.min_u8x16(a0, b0), self.min_u8x16(a1, b1)) } #[inline(always)] fn max_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); self.combine_u8x16(self.max_u8x16(a0, b0), self.max_u8x16(a1, b1)) } #[inline(always)] fn combine_u8x32(self, a: u8x32, b: u8x32) -> u8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u8x32(self, a: u8x32) -> (u8x16, u8x16) { let mut b0 = [0; 16usize]; let mut b1 = 
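// Inverse of combine_u8x32: copy each 16-byte half out of the backing array and rebuild it with simd_into.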
[0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn widen_u8x32(self, a: u8x32) -> u16x32 { let (a0, a1) = self.split_u8x32(a); self.combine_u16x16(self.widen_u8x16(a0), self.widen_u8x16(a1)) } #[inline(always)] fn reinterpret_u32_u8x32(self, a: u8x32) -> u32x8 { let (a0, a1) = self.split_u8x32(a); self.combine_u32x4( self.reinterpret_u32_u8x16(a0), self.reinterpret_u32_u8x16(a1), ) } #[inline(always)] fn splat_mask8x32(self, a: i8) -> mask8x32 { let half = self.splat_mask8x16(a); self.combine_mask8x16(half, half) } #[inline(always)] fn not_mask8x32(self, a: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); self.combine_mask8x16(self.not_mask8x16(a0), self.not_mask8x16(a1)) } #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.and_mask8x16(a0, b0), self.and_mask8x16(a1, b1)) } #[inline(always)] fn or_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.or_mask8x16(a0, b0), self.or_mask8x16(a1, b1)) } #[inline(always)] fn xor_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.xor_mask8x16(a0, b0), self.xor_mask8x16(a1, b1)) } #[inline(always)] fn select_mask8x32( self, a: mask8x32, b: mask8x32, c: mask8x32, ) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); let (c0, c1) = self.split_mask8x32(c); self.combine_mask8x16( self.select_mask8x16(a0, b0, c0), self.select_mask8x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); self.combine_mask8x16(self.simd_eq_mask8x16(a0, b0), self.simd_eq_mask8x16(a1, b1)) } #[inline(always)] fn combine_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x64 { let mut result = [0; 64usize]; result[0..32usize].copy_from_slice(&a.val); result[32usize..64usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask8x32(self, a: mask8x32) -> (mask8x16, mask8x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x16(self, a: i16) -> i16x16 { let half = self.splat_i16x8(a); self.combine_i16x8(half, half) } #[inline(always)] fn not_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.not_i16x8(a0), self.not_i16x8(a1)) } #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.add_i16x8(a0, b0), self.add_i16x8(a1, b1)) } #[inline(always)] fn sub_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.sub_i16x8(a0, b0), self.sub_i16x8(a1, b1)) } #[inline(always)] fn mul_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.mul_i16x8(a0, b0), self.mul_i16x8(a1, b1)) } #[inline(always)] fn and_i16x16(self, a: i16x16, b: 
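// i16x16: the same delegation over i16x8 halves; comparisons assemble a mask16x16 from two mask16x8 results.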
i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.and_i16x8(a0, b0), self.and_i16x8(a1, b1)) } #[inline(always)] fn or_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.or_i16x8(a0, b0), self.or_i16x8(a1, b1)) } #[inline(always)] fn xor_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.xor_i16x8(a0, b0), self.xor_i16x8(a1, b1)) } #[inline(always)] fn shr_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shr_i16x8(a0, b), self.shr_i16x8(a1, b)) } #[inline(always)] fn shrv_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.shrv_i16x8(a0, b0), self.shrv_i16x8(a1, b1)) } #[inline(always)] fn shl_i16x16(self, a: i16x16, b: u32) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.shl_i16x8(a0, b), self.shl_i16x8(a1, b)) } #[inline(always)] fn simd_eq_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_eq_i16x8(a0, b0), self.simd_eq_i16x8(a1, b1)) } #[inline(always)] fn simd_lt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_lt_i16x8(a0, b0), self.simd_lt_i16x8(a1, b1)) } #[inline(always)] fn simd_le_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_le_i16x8(a0, b0), self.simd_le_i16x8(a1, b1)) } #[inline(always)] fn simd_ge_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_ge_i16x8(a0, b0), self.simd_ge_i16x8(a1, b1)) } #[inline(always)] fn simd_gt_i16x16(self, a: i16x16, b: i16x16) -> mask16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_mask16x8(self.simd_gt_i16x8(a0, b0), self.simd_gt_i16x8(a1, b1)) } #[inline(always)] fn zip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, _) = self.split_i16x16(a); let (b0, _) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a0, b0), self.zip_high_i16x8(a0, b0)) } #[inline(always)] fn zip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (_, a1) = self.split_i16x16(a); let (_, b1) = self.split_i16x16(b); self.combine_i16x8(self.zip_low_i16x8(a1, b1), self.zip_high_i16x8(a1, b1)) } #[inline(always)] fn unzip_low_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_low_i16x8(a0, a1), self.unzip_low_i16x8(b0, b1)) } #[inline(always)] fn unzip_high_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.unzip_high_i16x8(a0, a1), self.unzip_high_i16x8(b0, b1)) } #[inline(always)] fn select_i16x16(self, a: mask16x16, b: i16x16, c: i16x16) -> i16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_i16x16(b); let (c0, c1) = self.split_i16x16(c); self.combine_i16x8(self.select_i16x8(a0, b0, c0), self.select_i16x8(a1, b1, c1)) } #[inline(always)] fn 
min_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.min_i16x8(a0, b0), self.min_i16x8(a1, b1)) } #[inline(always)] fn max_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); self.combine_i16x8(self.max_i16x8(a0, b0), self.max_i16x8(a1, b1)) } #[inline(always)] fn combine_i16x16(self, a: i16x16, b: i16x16) -> i16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i16x16(self, a: i16x16) -> (i16x8, i16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x16(self, a: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); self.combine_i16x8(self.neg_i16x8(a0), self.neg_i16x8(a1)) } #[inline(always)] fn reinterpret_u8_i16x16(self, a: i16x16) -> u8x32 { let (a0, a1) = self.split_i16x16(a); self.combine_u8x16(self.reinterpret_u8_i16x8(a0), self.reinterpret_u8_i16x8(a1)) } #[inline(always)] fn reinterpret_u32_i16x16(self, a: i16x16) -> u32x8 { let (a0, a1) = self.split_i16x16(a); self.combine_u32x4( self.reinterpret_u32_i16x8(a0), self.reinterpret_u32_i16x8(a1), ) } #[inline(always)] fn splat_u16x16(self, a: u16) -> u16x16 { let half = self.splat_u16x8(a); self.combine_u16x8(half, half) } #[inline(always)] fn not_u16x16(self, a: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.not_u16x8(a0), self.not_u16x8(a1)) } #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.add_u16x8(a0, b0), self.add_u16x8(a1, b1)) } #[inline(always)] fn sub_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.sub_u16x8(a0, b0), self.sub_u16x8(a1, b1)) } #[inline(always)] fn mul_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.mul_u16x8(a0, b0), self.mul_u16x8(a1, b1)) } #[inline(always)] fn and_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.and_u16x8(a0, b0), self.and_u16x8(a1, b1)) } #[inline(always)] fn or_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.or_u16x8(a0, b0), self.or_u16x8(a1, b1)) } #[inline(always)] fn xor_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.xor_u16x8(a0, b0), self.xor_u16x8(a1, b1)) } #[inline(always)] fn shr_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shr_u16x8(a0, b), self.shr_u16x8(a1, b)) } #[inline(always)] fn shrv_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.shrv_u16x8(a0, b0), self.shrv_u16x8(a1, b1)) } #[inline(always)] fn shl_u16x16(self, a: u16x16, b: u32) -> u16x16 { let (a0, a1) = self.split_u16x16(a); self.combine_u16x8(self.shl_u16x8(a0, b), 
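// both halves shift by the same scalar count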
self.shl_u16x8(a1, b)) } #[inline(always)] fn simd_eq_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_eq_u16x8(a0, b0), self.simd_eq_u16x8(a1, b1)) } #[inline(always)] fn simd_lt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_lt_u16x8(a0, b0), self.simd_lt_u16x8(a1, b1)) } #[inline(always)] fn simd_le_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_le_u16x8(a0, b0), self.simd_le_u16x8(a1, b1)) } #[inline(always)] fn simd_ge_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_ge_u16x8(a0, b0), self.simd_ge_u16x8(a1, b1)) } #[inline(always)] fn simd_gt_u16x16(self, a: u16x16, b: u16x16) -> mask16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_mask16x8(self.simd_gt_u16x8(a0, b0), self.simd_gt_u16x8(a1, b1)) } #[inline(always)] fn zip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, _) = self.split_u16x16(a); let (b0, _) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a0, b0), self.zip_high_u16x8(a0, b0)) } #[inline(always)] fn zip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (_, a1) = self.split_u16x16(a); let (_, b1) = self.split_u16x16(b); self.combine_u16x8(self.zip_low_u16x8(a1, b1), self.zip_high_u16x8(a1, b1)) } #[inline(always)] fn unzip_low_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_low_u16x8(a0, a1), self.unzip_low_u16x8(b0, b1)) } #[inline(always)] fn unzip_high_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.unzip_high_u16x8(a0, a1), self.unzip_high_u16x8(b0, b1)) } #[inline(always)] fn select_u16x16(self, a: mask16x16, b: u16x16, c: u16x16) -> u16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_u16x16(b); let (c0, c1) = self.split_u16x16(c); self.combine_u16x8(self.select_u16x8(a0, b0, c0), self.select_u16x8(a1, b1, c1)) } #[inline(always)] fn min_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.min_u16x8(a0, b0), self.min_u16x8(a1, b1)) } #[inline(always)] fn max_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); self.combine_u16x8(self.max_u16x8(a0, b0), self.max_u16x8(a1, b1)) } #[inline(always)] fn combine_u16x16(self, a: u16x16, b: u16x16) -> u16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u16x16(self, a: u16x16) -> (u16x8, u16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn narrow_u16x16(self, a: u16x16) -> u8x16 { let mask = u16x8_splat(0xFF); let (low, high) = self.split_u16x16(a); let low_masked = v128_and(low.into(), mask); let high_masked = v128_and(high.into(), mask); let result = 
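// u8x16_narrow_i16x8 is a saturating narrow of signed 16-bit lanes to u8; the 0xFF mask above keeps every lane in 0..=255, so the result is a plain truncating narrow.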
u8x16_narrow_i16x8(low_masked, high_masked); result.simd_into(self) } #[inline(always)] fn reinterpret_u8_u16x16(self, a: u16x16) -> u8x32 { let (a0, a1) = self.split_u16x16(a); self.combine_u8x16(self.reinterpret_u8_u16x8(a0), self.reinterpret_u8_u16x8(a1)) } #[inline(always)] fn reinterpret_u32_u16x16(self, a: u16x16) -> u32x8 { let (a0, a1) = self.split_u16x16(a); self.combine_u32x4( self.reinterpret_u32_u16x8(a0), self.reinterpret_u32_u16x8(a1), ) } #[inline(always)] fn splat_mask16x16(self, a: i16) -> mask16x16 { let half = self.splat_mask16x8(a); self.combine_mask16x8(half, half) } #[inline(always)] fn not_mask16x16(self, a: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); self.combine_mask16x8(self.not_mask16x8(a0), self.not_mask16x8(a1)) } #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.and_mask16x8(a0, b0), self.and_mask16x8(a1, b1)) } #[inline(always)] fn or_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.or_mask16x8(a0, b0), self.or_mask16x8(a1, b1)) } #[inline(always)] fn xor_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.xor_mask16x8(a0, b0), self.xor_mask16x8(a1, b1)) } #[inline(always)] fn select_mask16x16( self, a: mask16x16, b: mask16x16, c: mask16x16, ) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); let (c0, c1) = self.split_mask16x16(c); self.combine_mask16x8( self.select_mask16x8(a0, b0, c0), self.select_mask16x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); self.combine_mask16x8(self.simd_eq_mask16x8(a0, b0), self.simd_eq_mask16x8(a1, b1)) } #[inline(always)] fn combine_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x32 { let mut result = [0; 32usize]; result[0..16usize].copy_from_slice(&a.val); result[16usize..32usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask16x16(self, a: mask16x16) -> (mask16x8, mask16x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x8(self, a: i32) -> i32x8 { let half = self.splat_i32x4(a); self.combine_i32x4(half, half) } #[inline(always)] fn not_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.not_i32x4(a0), self.not_i32x4(a1)) } #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.add_i32x4(a0, b0), self.add_i32x4(a1, b1)) } #[inline(always)] fn sub_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.sub_i32x4(a0, b0), self.sub_i32x4(a1, b1)) } #[inline(always)] fn mul_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.mul_i32x4(a0, b0), self.mul_i32x4(a1, b1)) } #[inline(always)] fn and_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, 
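// i32x8: two i32x4 halves, as above.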
a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.and_i32x4(a0, b0), self.and_i32x4(a1, b1)) } #[inline(always)] fn or_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.or_i32x4(a0, b0), self.or_i32x4(a1, b1)) } #[inline(always)] fn xor_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.xor_i32x4(a0, b0), self.xor_i32x4(a1, b1)) } #[inline(always)] fn shr_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shr_i32x4(a0, b), self.shr_i32x4(a1, b)) } #[inline(always)] fn shrv_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.shrv_i32x4(a0, b0), self.shrv_i32x4(a1, b1)) } #[inline(always)] fn shl_i32x8(self, a: i32x8, b: u32) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.shl_i32x4(a0, b), self.shl_i32x4(a1, b)) } #[inline(always)] fn simd_eq_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_eq_i32x4(a0, b0), self.simd_eq_i32x4(a1, b1)) } #[inline(always)] fn simd_lt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_lt_i32x4(a0, b0), self.simd_lt_i32x4(a1, b1)) } #[inline(always)] fn simd_le_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_le_i32x4(a0, b0), self.simd_le_i32x4(a1, b1)) } #[inline(always)] fn simd_ge_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_ge_i32x4(a0, b0), self.simd_ge_i32x4(a1, b1)) } #[inline(always)] fn simd_gt_i32x8(self, a: i32x8, b: i32x8) -> mask32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_mask32x4(self.simd_gt_i32x4(a0, b0), self.simd_gt_i32x4(a1, b1)) } #[inline(always)] fn zip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, _) = self.split_i32x8(a); let (b0, _) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a0, b0), self.zip_high_i32x4(a0, b0)) } #[inline(always)] fn zip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (_, a1) = self.split_i32x8(a); let (_, b1) = self.split_i32x8(b); self.combine_i32x4(self.zip_low_i32x4(a1, b1), self.zip_high_i32x4(a1, b1)) } #[inline(always)] fn unzip_low_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_low_i32x4(a0, a1), self.unzip_low_i32x4(b0, b1)) } #[inline(always)] fn unzip_high_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.unzip_high_i32x4(a0, a1), self.unzip_high_i32x4(b0, b1)) } #[inline(always)] fn select_i32x8(self, a: mask32x8, b: i32x8, c: i32x8) -> i32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_i32x8(b); let (c0, c1) = self.split_i32x8(c); self.combine_i32x4(self.select_i32x4(a0, b0, c0), self.select_i32x4(a1, b1, c1)) } #[inline(always)] fn min_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); 
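// lane-wise minimum on each half, then recombine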
self.combine_i32x4(self.min_i32x4(a0, b0), self.min_i32x4(a1, b1)) } #[inline(always)] fn max_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); self.combine_i32x4(self.max_i32x4(a0, b0), self.max_i32x4(a1, b1)) } #[inline(always)] fn combine_i32x8(self, a: i32x8, b: i32x8) -> i32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_i32x8(self, a: i32x8) -> (i32x4, i32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x8(self, a: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_i32x4(self.neg_i32x4(a0), self.neg_i32x4(a1)) } #[inline(always)] fn reinterpret_u8_i32x8(self, a: i32x8) -> u8x32 { let (a0, a1) = self.split_i32x8(a); self.combine_u8x16(self.reinterpret_u8_i32x4(a0), self.reinterpret_u8_i32x4(a1)) } #[inline(always)] fn reinterpret_u32_i32x8(self, a: i32x8) -> u32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_u32x4( self.reinterpret_u32_i32x4(a0), self.reinterpret_u32_i32x4(a1), ) } #[inline(always)] fn cvt_f32_i32x8(self, a: i32x8) -> f32x8 { let (a0, a1) = self.split_i32x8(a); self.combine_f32x4(self.cvt_f32_i32x4(a0), self.cvt_f32_i32x4(a1)) } #[inline(always)] fn splat_u32x8(self, a: u32) -> u32x8 { let half = self.splat_u32x4(a); self.combine_u32x4(half, half) } #[inline(always)] fn not_u32x8(self, a: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.not_u32x4(a0), self.not_u32x4(a1)) } #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.add_u32x4(a0, b0), self.add_u32x4(a1, b1)) } #[inline(always)] fn sub_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.sub_u32x4(a0, b0), self.sub_u32x4(a1, b1)) } #[inline(always)] fn mul_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.mul_u32x4(a0, b0), self.mul_u32x4(a1, b1)) } #[inline(always)] fn and_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.and_u32x4(a0, b0), self.and_u32x4(a1, b1)) } #[inline(always)] fn or_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.or_u32x4(a0, b0), self.or_u32x4(a1, b1)) } #[inline(always)] fn xor_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.xor_u32x4(a0, b0), self.xor_u32x4(a1, b1)) } #[inline(always)] fn shr_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shr_u32x4(a0, b), self.shr_u32x4(a1, b)) } #[inline(always)] fn shrv_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.shrv_u32x4(a0, b0), self.shrv_u32x4(a1, b1)) } #[inline(always)] fn shl_u32x8(self, a: u32x8, b: u32) -> u32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_u32x4(self.shl_u32x4(a0, b), self.shl_u32x4(a1, b)) } #[inline(always)] fn 
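// u32x8 comparisons assemble a mask32x8 from the two mask32x4 half-results.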
simd_eq_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_eq_u32x4(a0, b0), self.simd_eq_u32x4(a1, b1)) } #[inline(always)] fn simd_lt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_lt_u32x4(a0, b0), self.simd_lt_u32x4(a1, b1)) } #[inline(always)] fn simd_le_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_le_u32x4(a0, b0), self.simd_le_u32x4(a1, b1)) } #[inline(always)] fn simd_ge_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_ge_u32x4(a0, b0), self.simd_ge_u32x4(a1, b1)) } #[inline(always)] fn simd_gt_u32x8(self, a: u32x8, b: u32x8) -> mask32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_mask32x4(self.simd_gt_u32x4(a0, b0), self.simd_gt_u32x4(a1, b1)) } #[inline(always)] fn zip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, _) = self.split_u32x8(a); let (b0, _) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a0, b0), self.zip_high_u32x4(a0, b0)) } #[inline(always)] fn zip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (_, a1) = self.split_u32x8(a); let (_, b1) = self.split_u32x8(b); self.combine_u32x4(self.zip_low_u32x4(a1, b1), self.zip_high_u32x4(a1, b1)) } #[inline(always)] fn unzip_low_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_low_u32x4(a0, a1), self.unzip_low_u32x4(b0, b1)) } #[inline(always)] fn unzip_high_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.unzip_high_u32x4(a0, a1), self.unzip_high_u32x4(b0, b1)) } #[inline(always)] fn select_u32x8(self, a: mask32x8, b: u32x8, c: u32x8) -> u32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_u32x8(b); let (c0, c1) = self.split_u32x8(c); self.combine_u32x4(self.select_u32x4(a0, b0, c0), self.select_u32x4(a1, b1, c1)) } #[inline(always)] fn min_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.min_u32x4(a0, b0), self.min_u32x4(a1, b1)) } #[inline(always)] fn max_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); self.combine_u32x4(self.max_u32x4(a0, b0), self.max_u32x4(a1, b1)) } #[inline(always)] fn combine_u32x8(self, a: u32x8, b: u32x8) -> u32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_u32x8(self, a: u32x8) -> (u32x4, u32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_u8_u32x8(self, a: u32x8) -> u8x32 { let (a0, a1) = self.split_u32x8(a); self.combine_u8x16(self.reinterpret_u8_u32x4(a0), self.reinterpret_u8_u32x4(a1)) } #[inline(always)] fn cvt_f32_u32x8(self, a: u32x8) -> f32x8 { let (a0, a1) = self.split_u32x8(a); self.combine_f32x4(self.cvt_f32_u32x4(a0), self.cvt_f32_u32x4(a1)) } #[inline(always)] fn 
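// mask32x8: logical ops on masks delegate half-by-half exactly like the value types.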
splat_mask32x8(self, a: i32) -> mask32x8 { let half = self.splat_mask32x4(a); self.combine_mask32x4(half, half) } #[inline(always)] fn not_mask32x8(self, a: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); self.combine_mask32x4(self.not_mask32x4(a0), self.not_mask32x4(a1)) } #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.and_mask32x4(a0, b0), self.and_mask32x4(a1, b1)) } #[inline(always)] fn or_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.or_mask32x4(a0, b0), self.or_mask32x4(a1, b1)) } #[inline(always)] fn xor_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.xor_mask32x4(a0, b0), self.xor_mask32x4(a1, b1)) } #[inline(always)] fn select_mask32x8( self, a: mask32x8, b: mask32x8, c: mask32x8, ) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); let (c0, c1) = self.split_mask32x8(c); self.combine_mask32x4( self.select_mask32x4(a0, b0, c0), self.select_mask32x4(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); self.combine_mask32x4(self.simd_eq_mask32x4(a0, b0), self.simd_eq_mask32x4(a1, b1)) } #[inline(always)] fn combine_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x16 { let mut result = [0; 16usize]; result[0..8usize].copy_from_slice(&a.val); result[8usize..16usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask32x8(self, a: mask32x8) -> (mask32x4, mask32x4) { let mut b0 = [0; 4usize]; let mut b1 = [0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_f64x4(self, a: f64) -> f64x4 { let half = self.splat_f64x2(a); self.combine_f64x2(half, half) } #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) } #[inline(always)] fn neg_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.neg_f64x2(a0), self.neg_f64x2(a1)) } #[inline(always)] fn sqrt_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.sqrt_f64x2(a0), self.sqrt_f64x2(a1)) } #[inline(always)] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.add_f64x2(a0, b0), self.add_f64x2(a1, b1)) } #[inline(always)] fn sub_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.sub_f64x2(a0, b0), self.sub_f64x2(a1, b1)) } #[inline(always)] fn mul_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.mul_f64x2(a0, b0), self.mul_f64x2(a1, b1)) } #[inline(always)] fn div_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.div_f64x2(a0, b0), self.div_f64x2(a1, b1)) } #[inline(always)] fn copysign_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, 
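// f64x4 delegates to f64x2 halves, covering the full float surface: copysign, precise min/max, madd/msub, and the rounding family.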
a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.copysign_f64x2(a0, b0), self.copysign_f64x2(a1, b1)) } #[inline(always)] fn simd_eq_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_eq_f64x2(a0, b0), self.simd_eq_f64x2(a1, b1)) } #[inline(always)] fn simd_lt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_lt_f64x2(a0, b0), self.simd_lt_f64x2(a1, b1)) } #[inline(always)] fn simd_le_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_le_f64x2(a0, b0), self.simd_le_f64x2(a1, b1)) } #[inline(always)] fn simd_ge_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_ge_f64x2(a0, b0), self.simd_ge_f64x2(a1, b1)) } #[inline(always)] fn simd_gt_f64x4(self, a: f64x4, b: f64x4) -> mask64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_mask64x2(self.simd_gt_f64x2(a0, b0), self.simd_gt_f64x2(a1, b1)) } #[inline(always)] fn zip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, _) = self.split_f64x4(a); let (b0, _) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a0, b0), self.zip_high_f64x2(a0, b0)) } #[inline(always)] fn zip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (_, a1) = self.split_f64x4(a); let (_, b1) = self.split_f64x4(b); self.combine_f64x2(self.zip_low_f64x2(a1, b1), self.zip_high_f64x2(a1, b1)) } #[inline(always)] fn unzip_low_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.unzip_low_f64x2(a0, a1), self.unzip_low_f64x2(b0, b1)) } #[inline(always)] fn unzip_high_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.unzip_high_f64x2(a0, a1), self.unzip_high_f64x2(b0, b1)) } #[inline(always)] fn max_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.max_f64x2(a0, b0), self.max_f64x2(a1, b1)) } #[inline(always)] fn max_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.max_precise_f64x2(a0, b0), self.max_precise_f64x2(a1, b1), ) } #[inline(always)] fn min_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2(self.min_f64x2(a0, b0), self.min_f64x2(a1, b1)) } #[inline(always)] fn min_precise_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); self.combine_f64x2( self.min_precise_f64x2(a0, b0), self.min_precise_f64x2(a1, b1), ) } #[inline(always)] fn madd_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.madd_f64x2(a0, b0, c0), self.madd_f64x2(a1, b1, c1)) } #[inline(always)] fn msub_f64x4(self, a: f64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); 
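// multiply-subtract each half independently, then recombine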
self.combine_f64x2(self.msub_f64x2(a0, b0, c0), self.msub_f64x2(a1, b1, c1)) } #[inline(always)] fn floor_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1)) } #[inline(always)] fn fract_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.fract_f64x2(a0), self.fract_f64x2(a1)) } #[inline(always)] fn trunc_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.trunc_f64x2(a0), self.trunc_f64x2(a1)) } #[inline(always)] fn select_f64x4(self, a: mask64x4, b: f64x4, c: f64x4) -> f64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_f64x4(b); let (c0, c1) = self.split_f64x4(c); self.combine_f64x2(self.select_f64x2(a0, b0, c0), self.select_f64x2(a1, b1, c1)) } #[inline(always)] fn combine_f64x4(self, a: f64x4, b: f64x4) -> f64x8 { let mut result = [0.0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_f64x4(self, a: f64x4) -> (f64x2, f64x2) { let mut b0 = [0.0; 2usize]; let mut b1 = [0.0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f32_f64x4(self, a: f64x4) -> f32x8 { let (a0, a1) = self.split_f64x4(a); self.combine_f32x4( self.reinterpret_f32_f64x2(a0), self.reinterpret_f32_f64x2(a1), ) } #[inline(always)] fn splat_mask64x4(self, a: i64) -> mask64x4 { let half = self.splat_mask64x2(a); self.combine_mask64x2(half, half) } #[inline(always)] fn not_mask64x4(self, a: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); self.combine_mask64x2(self.not_mask64x2(a0), self.not_mask64x2(a1)) } #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.and_mask64x2(a0, b0), self.and_mask64x2(a1, b1)) } #[inline(always)] fn or_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.or_mask64x2(a0, b0), self.or_mask64x2(a1, b1)) } #[inline(always)] fn xor_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.xor_mask64x2(a0, b0), self.xor_mask64x2(a1, b1)) } #[inline(always)] fn select_mask64x4( self, a: mask64x4, b: mask64x4, c: mask64x4, ) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); let (c0, c1) = self.split_mask64x4(c); self.combine_mask64x2( self.select_mask64x2(a0, b0, c0), self.select_mask64x2(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); self.combine_mask64x2(self.simd_eq_mask64x2(a0, b0), self.simd_eq_mask64x2(a1, b1)) } #[inline(always)] fn combine_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x8 { let mut result = [0; 8usize]; result[0..4usize].copy_from_slice(&a.val); result[4usize..8usize].copy_from_slice(&b.val); result.simd_into(self) } #[inline(always)] fn split_mask64x4(self, a: mask64x4) -> (mask64x2, mask64x2) { let mut b0 = [0; 2usize]; let mut b1 = [0; 2usize]; b0.copy_from_slice(&a.val[0..2usize]); b1.copy_from_slice(&a.val[2usize..4usize]); (b0.simd_into(self), 
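// The 512-bit f32x16 ops begin just below: each splits into two f32x8 halves, themselves emulated, recursing down to native f32x4.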
b1.simd_into(self)) } #[inline(always)] fn splat_f32x16(self, a: f32) -> f32x16 { let half = self.splat_f32x8(a); self.combine_f32x8(half, half) } #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) } #[inline(always)] fn neg_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.neg_f32x8(a0), self.neg_f32x8(a1)) } #[inline(always)] fn sqrt_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.sqrt_f32x8(a0), self.sqrt_f32x8(a1)) } #[inline(always)] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.add_f32x8(a0, b0), self.add_f32x8(a1, b1)) } #[inline(always)] fn sub_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.sub_f32x8(a0, b0), self.sub_f32x8(a1, b1)) } #[inline(always)] fn mul_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.mul_f32x8(a0, b0), self.mul_f32x8(a1, b1)) } #[inline(always)] fn div_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.div_f32x8(a0, b0), self.div_f32x8(a1, b1)) } #[inline(always)] fn copysign_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.copysign_f32x8(a0, b0), self.copysign_f32x8(a1, b1)) } #[inline(always)] fn simd_eq_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_eq_f32x8(a0, b0), self.simd_eq_f32x8(a1, b1)) } #[inline(always)] fn simd_lt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_lt_f32x8(a0, b0), self.simd_lt_f32x8(a1, b1)) } #[inline(always)] fn simd_le_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_le_f32x8(a0, b0), self.simd_le_f32x8(a1, b1)) } #[inline(always)] fn simd_ge_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_ge_f32x8(a0, b0), self.simd_ge_f32x8(a1, b1)) } #[inline(always)] fn simd_gt_f32x16(self, a: f32x16, b: f32x16) -> mask32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_mask32x8(self.simd_gt_f32x8(a0, b0), self.simd_gt_f32x8(a1, b1)) } #[inline(always)] fn zip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, _) = self.split_f32x16(a); let (b0, _) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a0, b0), self.zip_high_f32x8(a0, b0)) } #[inline(always)] fn zip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (_, a1) = self.split_f32x16(a); let (_, b1) = self.split_f32x16(b); self.combine_f32x8(self.zip_low_f32x8(a1, b1), self.zip_high_f32x8(a1, b1)) } #[inline(always)] fn unzip_low_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_low_f32x8(a0, a1), 
self.unzip_low_f32x8(b0, b1)) } #[inline(always)] fn unzip_high_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.unzip_high_f32x8(a0, a1), self.unzip_high_f32x8(b0, b1)) } #[inline(always)] fn max_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.max_f32x8(a0, b0), self.max_f32x8(a1, b1)) } #[inline(always)] fn max_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.max_precise_f32x8(a0, b0), self.max_precise_f32x8(a1, b1), ) } #[inline(always)] fn min_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8(self.min_f32x8(a0, b0), self.min_f32x8(a1, b1)) } #[inline(always)] fn min_precise_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); self.combine_f32x8( self.min_precise_f32x8(a0, b0), self.min_precise_f32x8(a1, b1), ) } #[inline(always)] fn madd_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.madd_f32x8(a0, b0, c0), self.madd_f32x8(a1, b1, c1)) } #[inline(always)] fn msub_f32x16(self, a: f32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.msub_f32x8(a0, b0, c0), self.msub_f32x8(a1, b1, c1)) } #[inline(always)] fn floor_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1)) } #[inline(always)] fn fract_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.fract_f32x8(a0), self.fract_f32x8(a1)) } #[inline(always)] fn trunc_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.trunc_f32x8(a0), self.trunc_f32x8(a1)) } #[inline(always)] fn select_f32x16(self, a: mask32x16, b: f32x16, c: f32x16) -> f32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_f32x16(b); let (c0, c1) = self.split_f32x16(c); self.combine_f32x8(self.select_f32x8(a0, b0, c0), self.select_f32x8(a1, b1, c1)) } #[inline(always)] fn split_f32x16(self, a: f32x16) -> (f32x8, f32x8) { let mut b0 = [0.0; 8usize]; let mut b1 = [0.0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8 { let (a0, a1) = self.split_f32x16(a); self.combine_f64x4( self.reinterpret_f64_f32x8(a0), self.reinterpret_f64_f32x8(a1), ) } #[inline(always)] fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8( self.reinterpret_i32_f32x8(a0), self.reinterpret_i32_f32x8(a1), ) } #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { let v0: v128 = unsafe { v128_load(src[0 * 4usize..].as_ptr() as *const v128) }; let v1: v128 = unsafe { v128_load(src[1 * 4usize..].as_ptr() as *const v128) }; let v2: v128 = unsafe { v128_load(src[2 * 4usize..].as_ptr() as *const v128) }; let v3: v128 = unsafe { v128_load(src[3 * 4usize..].as_ptr() as 
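// four raw 16-byte loads feed a 4x4 transpose built from u32x4_shuffle, de-interleaving the source into four contiguous lane groups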
*const v128) }; let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1); let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3); let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1); let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3); let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower); let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower); let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper); let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, v23_upper); let combined_lower = self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)); let combined_upper = self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)); self.combine_f32x8(combined_lower, combined_upper) } #[inline(always)]
// Inverse of the interleaved load above: transpose the four lane groups back into interleaved order and write them out with four raw v128 stores.
fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) { let (lower, upper) = self.split_f32x16(a); let (v0_vec, v1_vec) = self.split_f32x8(lower); let (v2_vec, v3_vec) = self.split_f32x8(upper); let v0: v128 = v0_vec.into(); let v1: v128 = v1_vec.into(); let v2: v128 = v2_vec.into(); let v3: v128 = v3_vec.into(); let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2); let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3); let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2); let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3); let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower); let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower); let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper); let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper); unsafe { v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0); v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1); v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2); v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3); } } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { let (a0, a1) = self.split_f32x16(a); self.combine_u8x32(self.reinterpret_u8_f32x8(a0), self.reinterpret_u8_f32x8(a1)) } #[inline(always)] fn reinterpret_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8( self.reinterpret_u32_f32x8(a0), self.reinterpret_u32_f32x8(a1), ) } #[inline(always)] fn cvt_u32_f32x16(self, a: f32x16) -> u32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_u32x8(self.cvt_u32_f32x8(a0), self.cvt_u32_f32x8(a1)) } #[inline(always)] fn cvt_i32_f32x16(self, a: f32x16) -> i32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_i32x8(self.cvt_i32_f32x8(a0), self.cvt_i32_f32x8(a1)) } #[inline(always)] fn splat_i8x64(self, a: i8) -> i8x64 { let half = self.splat_i8x32(a); self.combine_i8x32(half, half) } #[inline(always)] fn not_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.not_i8x32(a0), self.not_i8x32(a1)) } #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.add_i8x32(a0, b0), self.add_i8x32(a1, b1)) } #[inline(always)] fn sub_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.sub_i8x32(a0, b0), self.sub_i8x32(a1, b1)) } #[inline(always)] fn mul_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.mul_i8x32(a0, b0), self.mul_i8x32(a1, b1)) } #[inline(always)] fn and_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) =
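// i8x64 is the widest integer type here; each half is an emulated i8x32, so every op fans out to four native i8x16 operations.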
self.split_i8x64(b); self.combine_i8x32(self.and_i8x32(a0, b0), self.and_i8x32(a1, b1)) } #[inline(always)] fn or_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.or_i8x32(a0, b0), self.or_i8x32(a1, b1)) } #[inline(always)] fn xor_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.xor_i8x32(a0, b0), self.xor_i8x32(a1, b1)) } #[inline(always)] fn shr_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.shr_i8x32(a0, b), self.shr_i8x32(a1, b)) } #[inline(always)] fn shrv_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.shrv_i8x32(a0, b0), self.shrv_i8x32(a1, b1)) } #[inline(always)] fn shl_i8x64(self, a: i8x64, b: u32) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.shl_i8x32(a0, b), self.shl_i8x32(a1, b)) } #[inline(always)] fn simd_eq_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_eq_i8x32(a0, b0), self.simd_eq_i8x32(a1, b1)) } #[inline(always)] fn simd_lt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_lt_i8x32(a0, b0), self.simd_lt_i8x32(a1, b1)) } #[inline(always)] fn simd_le_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_le_i8x32(a0, b0), self.simd_le_i8x32(a1, b1)) } #[inline(always)] fn simd_ge_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_ge_i8x32(a0, b0), self.simd_ge_i8x32(a1, b1)) } #[inline(always)] fn simd_gt_i8x64(self, a: i8x64, b: i8x64) -> mask8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_mask8x32(self.simd_gt_i8x32(a0, b0), self.simd_gt_i8x32(a1, b1)) } #[inline(always)] fn zip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, _) = self.split_i8x64(a); let (b0, _) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a0, b0), self.zip_high_i8x32(a0, b0)) } #[inline(always)] fn zip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (_, a1) = self.split_i8x64(a); let (_, b1) = self.split_i8x64(b); self.combine_i8x32(self.zip_low_i8x32(a1, b1), self.zip_high_i8x32(a1, b1)) } #[inline(always)] fn unzip_low_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.unzip_low_i8x32(a0, a1), self.unzip_low_i8x32(b0, b1)) } #[inline(always)] fn unzip_high_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.unzip_high_i8x32(a0, a1), self.unzip_high_i8x32(b0, b1)) } #[inline(always)] fn select_i8x64(self, a: mask8x64, b: i8x64, c: i8x64) -> i8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_i8x64(b); let (c0, c1) = self.split_i8x64(c); self.combine_i8x32(self.select_i8x32(a0, b0, c0), self.select_i8x32(a1, b1, c1)) } #[inline(always)] fn min_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.min_i8x32(a0, b0), 
self.min_i8x32(a1, b1)) } #[inline(always)] fn max_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); self.combine_i8x32(self.max_i8x32(a0, b0), self.max_i8x32(a1, b1)) } #[inline(always)] fn split_i8x64(self, a: i8x64) -> (i8x32, i8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i8x64(self, a: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_i8x32(self.neg_i8x32(a0), self.neg_i8x32(a1)) } #[inline(always)] fn reinterpret_u8_i8x64(self, a: i8x64) -> u8x64 { let (a0, a1) = self.split_i8x64(a); self.combine_u8x32(self.reinterpret_u8_i8x32(a0), self.reinterpret_u8_i8x32(a1)) } #[inline(always)] fn reinterpret_u32_i8x64(self, a: i8x64) -> u32x16 { let (a0, a1) = self.split_i8x64(a); self.combine_u32x8( self.reinterpret_u32_i8x32(a0), self.reinterpret_u32_i8x32(a1), ) } #[inline(always)] fn splat_u8x64(self, a: u8) -> u8x64 { let half = self.splat_u8x32(a); self.combine_u8x32(half, half) } #[inline(always)] fn not_u8x64(self, a: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.not_u8x32(a0), self.not_u8x32(a1)) } #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.add_u8x32(a0, b0), self.add_u8x32(a1, b1)) } #[inline(always)] fn sub_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.sub_u8x32(a0, b0), self.sub_u8x32(a1, b1)) } #[inline(always)] fn mul_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.mul_u8x32(a0, b0), self.mul_u8x32(a1, b1)) } #[inline(always)] fn and_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.and_u8x32(a0, b0), self.and_u8x32(a1, b1)) } #[inline(always)] fn or_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.or_u8x32(a0, b0), self.or_u8x32(a1, b1)) } #[inline(always)] fn xor_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.xor_u8x32(a0, b0), self.xor_u8x32(a1, b1)) } #[inline(always)] fn shr_u8x64(self, a: u8x64, b: u32) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shr_u8x32(a0, b), self.shr_u8x32(a1, b)) } #[inline(always)] fn shrv_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.shrv_u8x32(a0, b0), self.shrv_u8x32(a1, b1)) } #[inline(always)] fn shl_u8x64(self, a: u8x64, b: u32) -> u8x64 { let (a0, a1) = self.split_u8x64(a); self.combine_u8x32(self.shl_u8x32(a0, b), self.shl_u8x32(a1, b)) } #[inline(always)] fn simd_eq_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_eq_u8x32(a0, b0), self.simd_eq_u8x32(a1, b1)) } #[inline(always)] fn simd_lt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_lt_u8x32(a0, b0), self.simd_lt_u8x32(a1, b1)) 
} #[inline(always)] fn simd_le_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_le_u8x32(a0, b0), self.simd_le_u8x32(a1, b1)) } #[inline(always)] fn simd_ge_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_ge_u8x32(a0, b0), self.simd_ge_u8x32(a1, b1)) } #[inline(always)] fn simd_gt_u8x64(self, a: u8x64, b: u8x64) -> mask8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_mask8x32(self.simd_gt_u8x32(a0, b0), self.simd_gt_u8x32(a1, b1)) } #[inline(always)] fn zip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, _) = self.split_u8x64(a); let (b0, _) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a0, b0), self.zip_high_u8x32(a0, b0)) } #[inline(always)] fn zip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (_, a1) = self.split_u8x64(a); let (_, b1) = self.split_u8x64(b); self.combine_u8x32(self.zip_low_u8x32(a1, b1), self.zip_high_u8x32(a1, b1)) } #[inline(always)] fn unzip_low_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_low_u8x32(a0, a1), self.unzip_low_u8x32(b0, b1)) } #[inline(always)] fn unzip_high_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.unzip_high_u8x32(a0, a1), self.unzip_high_u8x32(b0, b1)) } #[inline(always)] fn select_u8x64(self, a: mask8x64, b: u8x64, c: u8x64) -> u8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_u8x64(b); let (c0, c1) = self.split_u8x64(c); self.combine_u8x32(self.select_u8x32(a0, b0, c0), self.select_u8x32(a1, b1, c1)) } #[inline(always)] fn min_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.min_u8x32(a0, b0), self.min_u8x32(a1, b1)) } #[inline(always)] fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); self.combine_u8x32(self.max_u8x32(a0, b0), self.max_u8x32(a1, b1)) } #[inline(always)] fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { let v0: v128 = unsafe { v128_load(src[0 * 16usize..].as_ptr() as *const v128) }; let v1: v128 = unsafe { v128_load(src[1 * 16usize..].as_ptr() as *const v128) }; let v2: v128 = unsafe { v128_load(src[2 * 16usize..].as_ptr() as *const v128) }; let v3: v128 = unsafe { v128_load(src[3 * 16usize..].as_ptr() as *const v128) }; let v01_lower = u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v0, v1); let v23_lower = u8x16_shuffle::<0, 4, 8, 12, 16, 20, 24, 28, 1, 5, 9, 13, 17, 21, 25, 29>(v2, v3); let v01_upper = u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v0, v1); let v23_upper = u8x16_shuffle::<2, 6, 10, 14, 18, 22, 26, 30, 3, 7, 11, 15, 19, 23, 27, 31>(v2, v3); let out0 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>( v01_lower, v23_lower, ); let out1 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>( v01_lower, 
v23_lower, ); let out2 = u8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>( v01_upper, v23_upper, ); let out3 = u8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31>( v01_upper, v23_upper, ); let combined_lower = self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)); let combined_upper = self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)); self.combine_u8x32(combined_lower, combined_upper) } #[inline(always)] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { let (lower, upper) = self.split_u8x64(a); let (v0_vec, v1_vec) = self.split_u8x32(lower); let (v2_vec, v3_vec) = self.split_u8x32(upper); let v0: v128 = v0_vec.into(); let v1: v128 = v1_vec.into(); let v2: v128 = v2_vec.into(); let v3: v128 = v3_vec.into(); let v02_lower = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v0, v2); let v13_lower = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(v1, v3); let v02_upper = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v0, v2); let v13_upper = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(v1, v3); let out0 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( v02_lower, v13_lower, ); let out1 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( v02_lower, v13_lower, ); let out2 = u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>( v02_upper, v13_upper, ); let out3 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( v02_upper, v13_upper, ); unsafe { v128_store(dest[0 * 16usize..].as_mut_ptr() as *mut v128, out0); v128_store(dest[1 * 16usize..].as_mut_ptr() as *mut v128, out1); v128_store(dest[2 * 16usize..].as_mut_ptr() as *mut v128, out2); v128_store(dest[3 * 16usize..].as_mut_ptr() as *mut v128, out3); } } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { let (a0, a1) = self.split_u8x64(a); self.combine_u32x8( self.reinterpret_u32_u8x32(a0), self.reinterpret_u32_u8x32(a1), ) } #[inline(always)] fn splat_mask8x64(self, a: i8) -> mask8x64 { let half = self.splat_mask8x32(a); self.combine_mask8x32(half, half) } #[inline(always)] fn not_mask8x64(self, a: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); self.combine_mask8x32(self.not_mask8x32(a0), self.not_mask8x32(a1)) } #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.and_mask8x32(a0, b0), self.and_mask8x32(a1, b1)) } #[inline(always)] fn or_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.or_mask8x32(a0, b0), self.or_mask8x32(a1, b1)) } #[inline(always)] fn xor_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.xor_mask8x32(a0, b0), self.xor_mask8x32(a1, b1)) } #[inline(always)] fn select_mask8x64( self, a: mask8x64, b: mask8x64, c: mask8x64, ) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); let (c0, c1) = self.split_mask8x64(c); self.combine_mask8x32( self.select_mask8x32(a0, b0, c0), self.select_mask8x32(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, 
a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); self.combine_mask8x32(self.simd_eq_mask8x32(a0, b0), self.simd_eq_mask8x32(a1, b1)) } #[inline(always)] fn split_mask8x64(self, a: mask8x64) -> (mask8x32, mask8x32) { let mut b0 = [0; 32usize]; let mut b1 = [0; 32usize]; b0.copy_from_slice(&a.val[0..32usize]); b1.copy_from_slice(&a.val[32usize..64usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i16x32(self, a: i16) -> i16x32 { let half = self.splat_i16x16(a); self.combine_i16x16(half, half) } #[inline(always)] fn not_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.not_i16x16(a0), self.not_i16x16(a1)) } #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.add_i16x16(a0, b0), self.add_i16x16(a1, b1)) } #[inline(always)] fn sub_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.sub_i16x16(a0, b0), self.sub_i16x16(a1, b1)) } #[inline(always)] fn mul_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.mul_i16x16(a0, b0), self.mul_i16x16(a1, b1)) } #[inline(always)] fn and_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.and_i16x16(a0, b0), self.and_i16x16(a1, b1)) } #[inline(always)] fn or_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.or_i16x16(a0, b0), self.or_i16x16(a1, b1)) } #[inline(always)] fn xor_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.xor_i16x16(a0, b0), self.xor_i16x16(a1, b1)) } #[inline(always)] fn shr_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shr_i16x16(a0, b), self.shr_i16x16(a1, b)) } #[inline(always)] fn shrv_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.shrv_i16x16(a0, b0), self.shrv_i16x16(a1, b1)) } #[inline(always)] fn shl_i16x32(self, a: i16x32, b: u32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.shl_i16x16(a0, b), self.shl_i16x16(a1, b)) } #[inline(always)] fn simd_eq_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_eq_i16x16(a0, b0), self.simd_eq_i16x16(a1, b1)) } #[inline(always)] fn simd_lt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_lt_i16x16(a0, b0), self.simd_lt_i16x16(a1, b1)) } #[inline(always)] fn simd_le_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_le_i16x16(a0, b0), self.simd_le_i16x16(a1, b1)) } #[inline(always)] fn simd_ge_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_ge_i16x16(a0, b0), self.simd_ge_i16x16(a1, b1)) } #[inline(always)] fn 
simd_gt_i16x32(self, a: i16x32, b: i16x32) -> mask16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_mask16x16(self.simd_gt_i16x16(a0, b0), self.simd_gt_i16x16(a1, b1)) } #[inline(always)] fn zip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, _) = self.split_i16x32(a); let (b0, _) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a0, b0), self.zip_high_i16x16(a0, b0)) } #[inline(always)] fn zip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (_, a1) = self.split_i16x32(a); let (_, b1) = self.split_i16x32(b); self.combine_i16x16(self.zip_low_i16x16(a1, b1), self.zip_high_i16x16(a1, b1)) } #[inline(always)] fn unzip_low_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.unzip_low_i16x16(a0, a1), self.unzip_low_i16x16(b0, b1)) } #[inline(always)] fn unzip_high_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16( self.unzip_high_i16x16(a0, a1), self.unzip_high_i16x16(b0, b1), ) } #[inline(always)] fn select_i16x32(self, a: mask16x32, b: i16x32, c: i16x32) -> i16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_i16x32(b); let (c0, c1) = self.split_i16x32(c); self.combine_i16x16( self.select_i16x16(a0, b0, c0), self.select_i16x16(a1, b1, c1), ) } #[inline(always)] fn min_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.min_i16x16(a0, b0), self.min_i16x16(a1, b1)) } #[inline(always)] fn max_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); self.combine_i16x16(self.max_i16x16(a0, b0), self.max_i16x16(a1, b1)) } #[inline(always)] fn split_i16x32(self, a: i16x32) -> (i16x16, i16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i16x32(self, a: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); self.combine_i16x16(self.neg_i16x16(a0), self.neg_i16x16(a1)) } #[inline(always)] fn reinterpret_u8_i16x32(self, a: i16x32) -> u8x64 { let (a0, a1) = self.split_i16x32(a); self.combine_u8x32( self.reinterpret_u8_i16x16(a0), self.reinterpret_u8_i16x16(a1), ) } #[inline(always)] fn reinterpret_u32_i16x32(self, a: i16x32) -> u32x16 { let (a0, a1) = self.split_i16x32(a); self.combine_u32x8( self.reinterpret_u32_i16x16(a0), self.reinterpret_u32_i16x16(a1), ) } #[inline(always)] fn splat_u16x32(self, a: u16) -> u16x32 { let half = self.splat_u16x16(a); self.combine_u16x16(half, half) } #[inline(always)] fn not_u16x32(self, a: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.not_u16x16(a0), self.not_u16x16(a1)) } #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.add_u16x16(a0, b0), self.add_u16x16(a1, b1)) } #[inline(always)] fn sub_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.sub_u16x16(a0, b0), self.sub_u16x16(a1, b1)) } #[inline(always)] fn mul_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = 
self.split_u16x32(b); self.combine_u16x16(self.mul_u16x16(a0, b0), self.mul_u16x16(a1, b1)) } #[inline(always)] fn and_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.and_u16x16(a0, b0), self.and_u16x16(a1, b1)) } #[inline(always)] fn or_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.or_u16x16(a0, b0), self.or_u16x16(a1, b1)) } #[inline(always)] fn xor_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.xor_u16x16(a0, b0), self.xor_u16x16(a1, b1)) } #[inline(always)] fn shr_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shr_u16x16(a0, b), self.shr_u16x16(a1, b)) } #[inline(always)] fn shrv_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.shrv_u16x16(a0, b0), self.shrv_u16x16(a1, b1)) } #[inline(always)] fn shl_u16x32(self, a: u16x32, b: u32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u16x16(self.shl_u16x16(a0, b), self.shl_u16x16(a1, b)) } #[inline(always)] fn simd_eq_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_eq_u16x16(a0, b0), self.simd_eq_u16x16(a1, b1)) } #[inline(always)] fn simd_lt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_lt_u16x16(a0, b0), self.simd_lt_u16x16(a1, b1)) } #[inline(always)] fn simd_le_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_le_u16x16(a0, b0), self.simd_le_u16x16(a1, b1)) } #[inline(always)] fn simd_ge_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_ge_u16x16(a0, b0), self.simd_ge_u16x16(a1, b1)) } #[inline(always)] fn simd_gt_u16x32(self, a: u16x32, b: u16x32) -> mask16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_mask16x16(self.simd_gt_u16x16(a0, b0), self.simd_gt_u16x16(a1, b1)) } #[inline(always)] fn zip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, _) = self.split_u16x32(a); let (b0, _) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a0, b0), self.zip_high_u16x16(a0, b0)) } #[inline(always)] fn zip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (_, a1) = self.split_u16x32(a); let (_, b1) = self.split_u16x32(b); self.combine_u16x16(self.zip_low_u16x16(a1, b1), self.zip_high_u16x16(a1, b1)) } #[inline(always)] fn unzip_low_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.unzip_low_u16x16(a0, a1), self.unzip_low_u16x16(b0, b1)) } #[inline(always)] fn unzip_high_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16( self.unzip_high_u16x16(a0, a1), self.unzip_high_u16x16(b0, b1), ) } #[inline(always)] fn select_u16x32(self, a: mask16x32, b: u16x32, c: u16x32) -> u16x32 { let (a0, a1) = 
self.split_mask16x32(a); let (b0, b1) = self.split_u16x32(b); let (c0, c1) = self.split_u16x32(c); self.combine_u16x16( self.select_u16x16(a0, b0, c0), self.select_u16x16(a1, b1, c1), ) } #[inline(always)] fn min_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.min_u16x16(a0, b0), self.min_u16x16(a1, b1)) } #[inline(always)] fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); self.combine_u16x16(self.max_u16x16(a0, b0), self.max_u16x16(a1, b1)) } #[inline(always)] fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { let v0: v128 = unsafe { v128_load(src[0 * 8usize..].as_ptr() as *const v128) }; let v1: v128 = unsafe { v128_load(src[1 * 8usize..].as_ptr() as *const v128) }; let v2: v128 = unsafe { v128_load(src[2 * 8usize..].as_ptr() as *const v128) }; let v3: v128 = unsafe { v128_load(src[3 * 8usize..].as_ptr() as *const v128) }; let v01_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v0, v1); let v23_lower = u16x8_shuffle::<0, 4, 8, 12, 1, 5, 9, 13>(v2, v3); let v01_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v0, v1); let v23_upper = u16x8_shuffle::<2, 6, 10, 14, 3, 7, 11, 15>(v2, v3); let out0 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_lower, v23_lower); let out1 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(v01_lower, v23_lower); let out2 = u16x8_shuffle::<0, 1, 2, 3, 8, 9, 10, 11>(v01_upper, v23_upper); let out3 = u16x8_shuffle::<4, 5, 6, 7, 12, 13, 14, 15>(v01_upper, v23_upper); let combined_lower = self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)); let combined_upper = self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)); self.combine_u16x16(combined_lower, combined_upper) } #[inline(always)] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { let (lower, upper) = self.split_u16x32(a); let (v0_vec, v1_vec) = self.split_u16x16(lower); let (v2_vec, v3_vec) = self.split_u16x16(upper); let v0: v128 = v0_vec.into(); let v1: v128 = v1_vec.into(); let v2: v128 = v2_vec.into(); let v3: v128 = v3_vec.into(); let v02_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v0, v2); let v13_lower = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v1, v3); let v02_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v0, v2); let v13_upper = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v1, v3); let out0 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_lower, v13_lower); let out1 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_lower, v13_lower); let out2 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_upper, v13_upper); let out3 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_upper, v13_upper); unsafe { v128_store(dest[0 * 8usize..].as_mut_ptr() as *mut v128, out0); v128_store(dest[1 * 8usize..].as_mut_ptr() as *mut v128, out1); v128_store(dest[2 * 8usize..].as_mut_ptr() as *mut v128, out2); v128_store(dest[3 * 8usize..].as_mut_ptr() as *mut v128, out3); } } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { let (a0, a1) = self.split_u16x32(a); self.combine_u8x16(self.narrow_u16x16(a0), self.narrow_u16x16(a1)) } #[inline(always)] fn reinterpret_u8_u16x32(self, a: u16x32) -> u8x64 { let (a0, 
a1) = self.split_u16x32(a); self.combine_u8x32( self.reinterpret_u8_u16x16(a0), self.reinterpret_u8_u16x16(a1), ) } #[inline(always)] fn reinterpret_u32_u16x32(self, a: u16x32) -> u32x16 { let (a0, a1) = self.split_u16x32(a); self.combine_u32x8( self.reinterpret_u32_u16x16(a0), self.reinterpret_u32_u16x16(a1), ) } #[inline(always)] fn splat_mask16x32(self, a: i16) -> mask16x32 { let half = self.splat_mask16x16(a); self.combine_mask16x16(half, half) } #[inline(always)] fn not_mask16x32(self, a: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); self.combine_mask16x16(self.not_mask16x16(a0), self.not_mask16x16(a1)) } #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.and_mask16x16(a0, b0), self.and_mask16x16(a1, b1)) } #[inline(always)] fn or_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.or_mask16x16(a0, b0), self.or_mask16x16(a1, b1)) } #[inline(always)] fn xor_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16(self.xor_mask16x16(a0, b0), self.xor_mask16x16(a1, b1)) } #[inline(always)] fn select_mask16x32( self, a: mask16x32, b: mask16x32, c: mask16x32, ) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); let (c0, c1) = self.split_mask16x32(c); self.combine_mask16x16( self.select_mask16x16(a0, b0, c0), self.select_mask16x16(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); self.combine_mask16x16( self.simd_eq_mask16x16(a0, b0), self.simd_eq_mask16x16(a1, b1), ) } #[inline(always)] fn split_mask16x32(self, a: mask16x32) -> (mask16x16, mask16x16) { let mut b0 = [0; 16usize]; let mut b1 = [0; 16usize]; b0.copy_from_slice(&a.val[0..16usize]); b1.copy_from_slice(&a.val[16usize..32usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn splat_i32x16(self, a: i32) -> i32x16 { let half = self.splat_i32x8(a); self.combine_i32x8(half, half) } #[inline(always)] fn not_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.not_i32x8(a0), self.not_i32x8(a1)) } #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.add_i32x8(a0, b0), self.add_i32x8(a1, b1)) } #[inline(always)] fn sub_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.sub_i32x8(a0, b0), self.sub_i32x8(a1, b1)) } #[inline(always)] fn mul_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.mul_i32x8(a0, b0), self.mul_i32x8(a1, b1)) } #[inline(always)] fn and_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.and_i32x8(a0, b0), self.and_i32x8(a1, b1)) } #[inline(always)] fn or_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.or_i32x8(a0, b0), 
self.or_i32x8(a1, b1)) } #[inline(always)] fn xor_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.xor_i32x8(a0, b0), self.xor_i32x8(a1, b1)) } #[inline(always)] fn shr_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shr_i32x8(a0, b), self.shr_i32x8(a1, b)) } #[inline(always)] fn shrv_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.shrv_i32x8(a0, b0), self.shrv_i32x8(a1, b1)) } #[inline(always)] fn shl_i32x16(self, a: i32x16, b: u32) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.shl_i32x8(a0, b), self.shl_i32x8(a1, b)) } #[inline(always)] fn simd_eq_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_eq_i32x8(a0, b0), self.simd_eq_i32x8(a1, b1)) } #[inline(always)] fn simd_lt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_lt_i32x8(a0, b0), self.simd_lt_i32x8(a1, b1)) } #[inline(always)] fn simd_le_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_le_i32x8(a0, b0), self.simd_le_i32x8(a1, b1)) } #[inline(always)] fn simd_ge_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_ge_i32x8(a0, b0), self.simd_ge_i32x8(a1, b1)) } #[inline(always)] fn simd_gt_i32x16(self, a: i32x16, b: i32x16) -> mask32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_mask32x8(self.simd_gt_i32x8(a0, b0), self.simd_gt_i32x8(a1, b1)) } #[inline(always)] fn zip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, _) = self.split_i32x16(a); let (b0, _) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a0, b0), self.zip_high_i32x8(a0, b0)) } #[inline(always)] fn zip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (_, a1) = self.split_i32x16(a); let (_, b1) = self.split_i32x16(b); self.combine_i32x8(self.zip_low_i32x8(a1, b1), self.zip_high_i32x8(a1, b1)) } #[inline(always)] fn unzip_low_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_low_i32x8(a0, a1), self.unzip_low_i32x8(b0, b1)) } #[inline(always)] fn unzip_high_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.unzip_high_i32x8(a0, a1), self.unzip_high_i32x8(b0, b1)) } #[inline(always)] fn select_i32x16(self, a: mask32x16, b: i32x16, c: i32x16) -> i32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_i32x16(b); let (c0, c1) = self.split_i32x16(c); self.combine_i32x8(self.select_i32x8(a0, b0, c0), self.select_i32x8(a1, b1, c1)) } #[inline(always)] fn min_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); self.combine_i32x8(self.min_i32x8(a0, b0), self.min_i32x8(a1, b1)) } #[inline(always)] fn max_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); 
self.combine_i32x8(self.max_i32x8(a0, b0), self.max_i32x8(a1, b1)) } #[inline(always)] fn split_i32x16(self, a: i32x16) -> (i32x8, i32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn neg_i32x16(self, a: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_i32x8(self.neg_i32x8(a0), self.neg_i32x8(a1)) } #[inline(always)] fn reinterpret_u8_i32x16(self, a: i32x16) -> u8x64 { let (a0, a1) = self.split_i32x16(a); self.combine_u8x32(self.reinterpret_u8_i32x8(a0), self.reinterpret_u8_i32x8(a1)) } #[inline(always)] fn reinterpret_u32_i32x16(self, a: i32x16) -> u32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_u32x8( self.reinterpret_u32_i32x8(a0), self.reinterpret_u32_i32x8(a1), ) } #[inline(always)] fn cvt_f32_i32x16(self, a: i32x16) -> f32x16 { let (a0, a1) = self.split_i32x16(a); self.combine_f32x8(self.cvt_f32_i32x8(a0), self.cvt_f32_i32x8(a1)) } #[inline(always)] fn splat_u32x16(self, a: u32) -> u32x16 { let half = self.splat_u32x8(a); self.combine_u32x8(half, half) } #[inline(always)] fn not_u32x16(self, a: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.not_u32x8(a0), self.not_u32x8(a1)) } #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.add_u32x8(a0, b0), self.add_u32x8(a1, b1)) } #[inline(always)] fn sub_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.sub_u32x8(a0, b0), self.sub_u32x8(a1, b1)) } #[inline(always)] fn mul_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.mul_u32x8(a0, b0), self.mul_u32x8(a1, b1)) } #[inline(always)] fn and_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.and_u32x8(a0, b0), self.and_u32x8(a1, b1)) } #[inline(always)] fn or_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.or_u32x8(a0, b0), self.or_u32x8(a1, b1)) } #[inline(always)] fn xor_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.xor_u32x8(a0, b0), self.xor_u32x8(a1, b1)) } #[inline(always)] fn shr_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shr_u32x8(a0, b), self.shr_u32x8(a1, b)) } #[inline(always)] fn shrv_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.shrv_u32x8(a0, b0), self.shrv_u32x8(a1, b1)) } #[inline(always)] fn shl_u32x16(self, a: u32x16, b: u32) -> u32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_u32x8(self.shl_u32x8(a0, b), self.shl_u32x8(a1, b)) } #[inline(always)] fn simd_eq_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_eq_u32x8(a0, b0), self.simd_eq_u32x8(a1, b1)) } #[inline(always)] fn simd_lt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = 
self.split_u32x16(b); self.combine_mask32x8(self.simd_lt_u32x8(a0, b0), self.simd_lt_u32x8(a1, b1)) } #[inline(always)] fn simd_le_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_le_u32x8(a0, b0), self.simd_le_u32x8(a1, b1)) } #[inline(always)] fn simd_ge_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_ge_u32x8(a0, b0), self.simd_ge_u32x8(a1, b1)) } #[inline(always)] fn simd_gt_u32x16(self, a: u32x16, b: u32x16) -> mask32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_mask32x8(self.simd_gt_u32x8(a0, b0), self.simd_gt_u32x8(a1, b1)) } #[inline(always)] fn zip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, _) = self.split_u32x16(a); let (b0, _) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a0, b0), self.zip_high_u32x8(a0, b0)) } #[inline(always)] fn zip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (_, a1) = self.split_u32x16(a); let (_, b1) = self.split_u32x16(b); self.combine_u32x8(self.zip_low_u32x8(a1, b1), self.zip_high_u32x8(a1, b1)) } #[inline(always)] fn unzip_low_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_low_u32x8(a0, a1), self.unzip_low_u32x8(b0, b1)) } #[inline(always)] fn unzip_high_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.unzip_high_u32x8(a0, a1), self.unzip_high_u32x8(b0, b1)) } #[inline(always)] fn select_u32x16(self, a: mask32x16, b: u32x16, c: u32x16) -> u32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_u32x16(b); let (c0, c1) = self.split_u32x16(c); self.combine_u32x8(self.select_u32x8(a0, b0, c0), self.select_u32x8(a1, b1, c1)) } #[inline(always)] fn min_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.min_u32x8(a0, b0), self.min_u32x8(a1, b1)) } #[inline(always)] fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); self.combine_u32x8(self.max_u32x8(a0, b0), self.max_u32x8(a1, b1)) } #[inline(always)] fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { let v0: v128 = unsafe { v128_load(src[0 * 4usize..].as_ptr() as *const v128) }; let v1: v128 = unsafe { v128_load(src[1 * 4usize..].as_ptr() as *const v128) }; let v2: v128 = unsafe { v128_load(src[2 * 4usize..].as_ptr() as *const v128) }; let v3: v128 = unsafe { v128_load(src[3 * 4usize..].as_ptr() as *const v128) }; let v01_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v1); let v23_lower = u32x4_shuffle::<0, 4, 1, 5>(v2, v3); let v01_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v1); let v23_upper = u32x4_shuffle::<2, 6, 3, 7>(v2, v3); let out0 = u32x4_shuffle::<0, 1, 4, 5>(v01_lower, v23_lower); let out1 = u32x4_shuffle::<2, 3, 6, 7>(v01_lower, v23_lower); let out2 = u32x4_shuffle::<0, 1, 4, 5>(v01_upper, v23_upper); let out3 = u32x4_shuffle::<2, 3, 6, 7>(v01_upper, 
v23_upper); let combined_lower = self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)); let combined_upper = self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)); self.combine_u32x8(combined_lower, combined_upper) } #[inline(always)] fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { let (lower, upper) = self.split_u32x16(a); let (v0_vec, v1_vec) = self.split_u32x8(lower); let (v2_vec, v3_vec) = self.split_u32x8(upper); let v0: v128 = v0_vec.into(); let v1: v128 = v1_vec.into(); let v2: v128 = v2_vec.into(); let v3: v128 = v3_vec.into(); let v02_lower = u32x4_shuffle::<0, 4, 1, 5>(v0, v2); let v13_lower = u32x4_shuffle::<0, 4, 1, 5>(v1, v3); let v02_upper = u32x4_shuffle::<2, 6, 3, 7>(v0, v2); let v13_upper = u32x4_shuffle::<2, 6, 3, 7>(v1, v3); let out0 = u32x4_shuffle::<0, 4, 1, 5>(v02_lower, v13_lower); let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower); let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper); let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper); unsafe { v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0); v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1); v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2); v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3); } } #[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { let (a0, a1) = self.split_u32x16(a); self.combine_u8x32(self.reinterpret_u8_u32x8(a0), self.reinterpret_u8_u32x8(a1)) } #[inline(always)] fn cvt_f32_u32x16(self, a: u32x16) -> f32x16 { let (a0, a1) = self.split_u32x16(a); self.combine_f32x8(self.cvt_f32_u32x8(a0), self.cvt_f32_u32x8(a1)) } #[inline(always)] fn splat_mask32x16(self, a: i32) -> mask32x16 { let half = self.splat_mask32x8(a); self.combine_mask32x8(half, half) } #[inline(always)] fn not_mask32x16(self, a: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); self.combine_mask32x8(self.not_mask32x8(a0), self.not_mask32x8(a1)) } #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.and_mask32x8(a0, b0), self.and_mask32x8(a1, b1)) } #[inline(always)] fn or_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.or_mask32x8(a0, b0), self.or_mask32x8(a1, b1)) } #[inline(always)] fn xor_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.xor_mask32x8(a0, b0), self.xor_mask32x8(a1, b1)) } #[inline(always)] fn select_mask32x16( self, a: mask32x16, b: mask32x16, c: mask32x16, ) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); let (c0, c1) = self.split_mask32x16(c); self.combine_mask32x8( self.select_mask32x8(a0, b0, c0), self.select_mask32x8(a1, b1, c1), ) } #[inline(always)] fn simd_eq_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); self.combine_mask32x8(self.simd_eq_mask32x8(a0, b0), self.simd_eq_mask32x8(a1, b1)) } #[inline(always)] fn split_mask32x16(self, a: mask32x16) -> (mask32x8, mask32x8) { let mut b0 = [0; 8usize]; let mut b1 = [0; 8usize]; b0.copy_from_slice(&a.val[0..8usize]); b1.copy_from_slice(&a.val[8usize..16usize]); (b0.simd_into(self), 
b1.simd_into(self)) } #[inline(always)] fn splat_f64x8(self, a: f64) -> f64x8 { let half = self.splat_f64x4(a); self.combine_f64x4(half, half) } #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) } #[inline(always)] fn neg_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.neg_f64x4(a0), self.neg_f64x4(a1)) } #[inline(always)] fn sqrt_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.sqrt_f64x4(a0), self.sqrt_f64x4(a1)) } #[inline(always)] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.add_f64x4(a0, b0), self.add_f64x4(a1, b1)) } #[inline(always)] fn sub_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.sub_f64x4(a0, b0), self.sub_f64x4(a1, b1)) } #[inline(always)] fn mul_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.mul_f64x4(a0, b0), self.mul_f64x4(a1, b1)) } #[inline(always)] fn div_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.div_f64x4(a0, b0), self.div_f64x4(a1, b1)) } #[inline(always)] fn copysign_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.copysign_f64x4(a0, b0), self.copysign_f64x4(a1, b1)) } #[inline(always)] fn simd_eq_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_eq_f64x4(a0, b0), self.simd_eq_f64x4(a1, b1)) } #[inline(always)] fn simd_lt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_lt_f64x4(a0, b0), self.simd_lt_f64x4(a1, b1)) } #[inline(always)] fn simd_le_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_le_f64x4(a0, b0), self.simd_le_f64x4(a1, b1)) } #[inline(always)] fn simd_ge_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_ge_f64x4(a0, b0), self.simd_ge_f64x4(a1, b1)) } #[inline(always)] fn simd_gt_f64x8(self, a: f64x8, b: f64x8) -> mask64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_mask64x4(self.simd_gt_f64x4(a0, b0), self.simd_gt_f64x4(a1, b1)) } #[inline(always)] fn zip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, _) = self.split_f64x8(a); let (b0, _) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a0, b0), self.zip_high_f64x4(a0, b0)) } #[inline(always)] fn zip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (_, a1) = self.split_f64x8(a); let (_, b1) = self.split_f64x8(b); self.combine_f64x4(self.zip_low_f64x4(a1, b1), self.zip_high_f64x4(a1, b1)) } #[inline(always)] fn unzip_low_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_low_f64x4(a0, a1), self.unzip_low_f64x4(b0, b1)) } #[inline(always)] fn unzip_high_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let 
(a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.unzip_high_f64x4(a0, a1), self.unzip_high_f64x4(b0, b1)) } #[inline(always)] fn max_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.max_f64x4(a0, b0), self.max_f64x4(a1, b1)) } #[inline(always)] fn max_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.max_precise_f64x4(a0, b0), self.max_precise_f64x4(a1, b1), ) } #[inline(always)] fn min_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4(self.min_f64x4(a0, b0), self.min_f64x4(a1, b1)) } #[inline(always)] fn min_precise_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); self.combine_f64x4( self.min_precise_f64x4(a0, b0), self.min_precise_f64x4(a1, b1), ) } #[inline(always)] fn madd_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.madd_f64x4(a0, b0, c0), self.madd_f64x4(a1, b1, c1)) } #[inline(always)] fn msub_f64x8(self, a: f64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.msub_f64x4(a0, b0, c0), self.msub_f64x4(a1, b1, c1)) } #[inline(always)] fn floor_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1)) } #[inline(always)] fn fract_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.fract_f64x4(a0), self.fract_f64x4(a1)) } #[inline(always)] fn trunc_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.trunc_f64x4(a0), self.trunc_f64x4(a1)) } #[inline(always)] fn select_f64x8(self, a: mask64x8, b: f64x8, c: f64x8) -> f64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_f64x8(b); let (c0, c1) = self.split_f64x8(c); self.combine_f64x4(self.select_f64x4(a0, b0, c0), self.select_f64x4(a1, b1, c1)) } #[inline(always)] fn split_f64x8(self, a: f64x8) -> (f64x4, f64x4) { let mut b0 = [0.0; 4usize]; let mut b1 = [0.0; 4usize]; b0.copy_from_slice(&a.val[0..4usize]); b1.copy_from_slice(&a.val[4usize..8usize]); (b0.simd_into(self), b1.simd_into(self)) } #[inline(always)] fn reinterpret_f32_f64x8(self, a: f64x8) -> f32x16 { let (a0, a1) = self.split_f64x8(a); self.combine_f32x8( self.reinterpret_f32_f64x4(a0), self.reinterpret_f32_f64x4(a1), ) } #[inline(always)] fn splat_mask64x8(self, a: i64) -> mask64x8 { let half = self.splat_mask64x4(a); self.combine_mask64x4(half, half) } #[inline(always)] fn not_mask64x8(self, a: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); self.combine_mask64x4(self.not_mask64x4(a0), self.not_mask64x4(a1)) } #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.and_mask64x4(a0, b0), self.and_mask64x4(a1, b1)) } #[inline(always)] fn or_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); self.combine_mask64x4(self.or_mask64x4(a0, b0), self.or_mask64x4(a1, 
b1))
    }
    #[inline(always)]
    fn xor_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.xor_mask64x4(a0, b0), self.xor_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn select_mask64x8(
        self,
        a: mask64x8<Self>,
        b: mask64x8<Self>,
        c: mask64x8<Self>,
    ) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        let (c0, c1) = self.split_mask64x8(c);
        self.combine_mask64x4(
            self.select_mask64x4(a0, b0, c0),
            self.select_mask64x4(a1, b1, c1),
        )
    }
    #[inline(always)]
    fn simd_eq_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
        let (a0, a1) = self.split_mask64x8(a);
        let (b0, b1) = self.split_mask64x8(b);
        self.combine_mask64x4(self.simd_eq_mask64x4(a0, b0), self.simd_eq_mask64x4(a1, b1))
    }
    #[inline(always)]
    fn split_mask64x8(self, a: mask64x8<Self>) -> (mask64x4<Self>, mask64x4<Self>) {
        let mut b0 = [0; 4usize];
        let mut b1 = [0; 4usize];
        b0.copy_from_slice(&a.val[0..4usize]);
        b1.copy_from_slice(&a.val[4usize..8usize]);
        (b0.simd_into(self), b1.simd_into(self))
    }
}
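// The wide-vector (`x8`/`x16`/`x32`/`x64`) methods above all follow a single
// generated pattern: split the operand into two half-width vectors, apply the
// half-width operation to each part, and recombine the results. A minimal
// sketch of that shape (illustrative only; `add_wide` is a hypothetical name,
// not an item of this crate):
//
//     fn add_wide<S: Simd>(simd: S, a: f32x16<S>, b: f32x16<S>) -> f32x16<S> {
//         let (a0, a1) = simd.split_f32x16(a);
//         let (b0, b1) = simd.split_f32x16(b);
//         simd.combine_f32x8(simd.add_f32x8(a0, b0), simd.add_f32x8(a1, b1))
//     }
//
// On this backend the recursion bottoms out at the native 128-bit `v128` type,
// so an `f32x16` operation ultimately executes as four `f32x4` instructions.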
impl<S: Simd> SimdFrom<v128, S> for f32x4<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f32x4<S>> for v128 {
    #[inline(always)]
    fn from(value: f32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for i8x16<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i8x16<S>> for v128 {
    #[inline(always)]
    fn from(value: i8x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for u8x16<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u8x16<S>> for v128 {
    #[inline(always)]
    fn from(value: u8x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for mask8x16<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask8x16<S>> for v128 {
    #[inline(always)]
    fn from(value: mask8x16<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for i16x8<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i16x8<S>> for v128 {
    #[inline(always)]
    fn from(value: i16x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for u16x8<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u16x8<S>> for v128 {
    #[inline(always)]
    fn from(value: u16x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for mask16x8<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask16x8<S>> for v128 {
    #[inline(always)]
    fn from(value: mask16x8<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for i32x4<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<i32x4<S>> for v128 {
    #[inline(always)]
    fn from(value: i32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for u32x4<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<u32x4<S>> for v128 {
    #[inline(always)]
    fn from(value: u32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for mask32x4<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask32x4<S>> for v128 {
    #[inline(always)]
    fn from(value: mask32x4<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for f64x2<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<f64x2<S>> for v128 {
    #[inline(always)]
    fn from(value: f64x2<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
impl<S: Simd> SimdFrom<v128, S> for mask64x2<S> {
    #[inline(always)]
    fn simd_from(arch: v128, simd: S) -> Self {
        Self {
            val: unsafe { core::mem::transmute(arch) },
            simd,
        }
    }
}
impl<S: Simd> From<mask64x2<S>> for v128 {
    #[inline(always)]
    fn from(value: mask64x2<S>) -> Self {
        unsafe { core::mem::transmute(value.val) }
    }
}
fearless_simd-0.3.0/src/generated.rs000064400000000000000000000042441046102023000155430ustar 00000000000000
// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

#![expect(
    missing_docs,
    clippy::cast_possible_truncation,
    clippy::unseparated_literal_suffix,
    trivial_numeric_casts,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]
#![cfg_attr(
    target_arch = "x86_64",
    expect(
        clippy::should_implement_trait,
        clippy::missing_transmute_annotations,
        clippy::useless_transmute,
        clippy::new_without_default,
        clippy::unnecessary_cast,
        reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
    )
)]
#![cfg_attr(
    target_arch = "wasm32",
    expect(
        clippy::should_implement_trait,
        clippy::missing_transmute_annotations,
        clippy::useless_transmute,
        clippy::new_without_default,
        clippy::unnecessary_cast,
        reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
    )
)]
#![cfg_attr(
    all(
        feature = "std",
        all(not(target_arch = "x86_64"), not(target_arch = "wasm32"))
    ),
    expect(
        clippy::missing_safety_doc,
        clippy::should_implement_trait,
        clippy::missing_transmute_annotations,
        clippy::useless_transmute,
        clippy::new_without_default,
        clippy::unnecessary_cast,
        reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
    )
)]

//! A module containing generated files
//!
//! All files in this subdirectory are autogenerated by the `fearless_simd_gen` crate.

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod avx2;
mod fallback;
#[cfg(target_arch = "aarch64")]
mod neon;
mod ops;
mod simd_trait;
mod simd_types;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
mod sse4_2;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
mod wasm;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use avx2::*;
pub use fallback::*;
#[cfg(target_arch = "aarch64")]
pub use neon::*;
pub use simd_trait::*;
pub use simd_types::*;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub use sse4_2::*;
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub use wasm::*;
fearless_simd-0.3.0/src/impl_macros.rs000064400000000000000000000017371046102023000161140ustar 00000000000000
// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Macros used by implementations

#![allow(
    unused_macros,
    unused_imports,
    reason = "Not all macros will be used by all implementations"
)]

// Adapted from similar macro in pulp
macro_rules! delegate {
    ( $prefix:path :
        $(
            $(#[$attr: meta])*
            $(unsafe $($placeholder: lifetime)?)?
            fn $func: ident $(<$(const $generic: ident: $generic_ty: ty),* $(,)?>)?(
                $($arg: ident: $ty: ty),* $(,)?
            ) $(-> $ret: ty)?;
        )*
    ) => {
        $(
            #[doc=concat!("See [`", stringify!($prefix), "::", stringify!($func), "`].")]
            $(#[$attr])*
            #[inline(always)]
            pub $(unsafe $($placeholder)?)? fn $func $(<$(const $generic: $generic_ty),*>)?(self, $($arg: $ty),*) $(-> $ret)? {
                unsafe { $func $(::<$($generic,)*>)?($($arg,)*) }
            }
        )*
    };
}
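// A hypothetical invocation (an illustrative sketch, not taken verbatim from
// this crate's sources): delegating to a `core::arch` intrinsic produces a
// safe, always-inlined wrapper method on the level type, for example:
//
//     delegate! { core::arch::aarch64:
//         fn vaddq_f32(a: float32x4_t, b: float32x4_t) -> float32x4_t;
//     }
//
// This expands to `pub fn vaddq_f32(self, a: float32x4_t, b: float32x4_t) -> float32x4_t`,
// where the internal `unsafe` call to the intrinsic is justified by the proof of
// target-feature support carried by `self`.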
macro_rules! delegate {
    ( $prefix:path :
        $(
            $(#[$attr: meta])*
            $(unsafe $($placeholder: lifetime)?)?
            fn $func: ident $(<$(const $generic: ident: $generic_ty: ty),* $(,)?>)?(
                $($arg: ident: $ty: ty),* $(,)?
            ) $(-> $ret: ty)?;
    )*) => {
        $(
            #[doc = concat!("See [`", stringify!($prefix), "::", stringify!($func), "`].")]
            $(#[$attr])*
            #[inline(always)]
            pub $(unsafe $($placeholder)?)? fn $func $(<$(const $generic: $generic_ty),*>)?(self, $($arg: $ty),*) $(-> $ret)? {
                unsafe { $func $(::<$($generic,)*>)?($($arg,)*) }
            }
        )*
    };
}

pub(crate) use delegate;
fearless_simd-0.3.0/src/lib.rs000064400000000000000000000406551046102023000143570ustar 00000000000000
// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! A helper library to make SIMD more friendly.
//!
//! Fearless SIMD exposes safe SIMD with ergonomic multi-versioning in Rust.
//!
//! Fearless SIMD uses "marker values" which serve as proofs of which target features are
//! available on the current CPU.
//! These each implement the [`Simd`] trait, which exposes a core set of SIMD operations,
//! implemented as efficiently as possible on each target platform.
//!
//! Additionally, there are types for packed vectors of a specific width and element type
//! (such as [`f32x4`]).
//! Fearless SIMD does not currently support vectors of less than 128 bits.
//! These vector types implement some standard arithmetic traits (e.g. they can be added
//! together using `+` and multiplied by a scalar using `*`), with implementations that use
//! SIMD instructions as efficiently as possible.
//! These can be created in a SIMD context using the [`SimdFrom`] trait, or the
//! [`from_slice`][SimdBase::from_slice] associated function.
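//!
//! For example (an illustrative sketch; it assumes `from_slice` takes the marker value
//! first, and that lane values are readable through the public `val` field):
//!
//! ```rust
//! use fearless_simd::{Level, Simd, SimdBase, dispatch, f32x4};
//!
//! #[inline(always)]
//! fn axpy<S: Simd>(simd: S, a: f32, x: &[f32], y: &[f32]) -> [f32; 4] {
//!     let xv = f32x4::from_slice(simd, x);
//!     let yv = f32x4::from_slice(simd, y);
//!     (xv * a + yv).val
//! }
//!
//! let level = Level::new();
//! assert_eq!(
//!     dispatch!(level, simd => axpy(simd, 2.0, &[1.0; 4], &[3.0; 4])),
//!     [5.0; 4],
//! );
//! ```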
//!
//! To call a function with the best available target features and get the associated `Simd`
//! implementation, use the [`dispatch!()`] macro:
//!
//! ```rust
//! use fearless_simd::{Level, Simd, dispatch};
//!
//! #[inline(always)]
//! fn sigmoid<S: Simd>(simd: S, x: &[f32], out: &mut [f32]) { /* ... */ }
//!
//! // The stored level, which you should only construct once in your application.
//! let level = Level::new();
//!
//! dispatch!(level, simd => sigmoid(simd, &[/*...*/], &mut [/*...*/]));
//! ```
//!
//! A few things to note:
//!
//! 1) `sigmoid` is generic over any `Simd` type.
//! 2) The [`dispatch!()`] macro is used to invoke the given function with the target features
//!    associated with the supplied [`Level`].
//! 3) The function or closure passed to [`dispatch!()`] should be `#[inline(always)]`.
//!    The performance of the SIMD implementation may be poor if that isn't the case.
//!    See [the section on inlining](#inlining) for details.
//!
//! The first parameter to [`dispatch!()`] is the [`Level`].
//! If you are writing an application, you should create this once (using [`Level::new`]) and
//! pass it to any function which wants to use SIMD.
//! This type stores which instruction sets are available to the current process, which the
//! macro uses to dispatch to the most optimal variant of the supplied function.
//!
//! # Inlining
//!
//! Fearless SIMD relies heavily on Rust's inlining support to create functions which have the
//! given target features enabled.
//! As such, most functions which you write when using Fearless SIMD should have the
//! `#[inline(always)]` attribute.
//!
//! # WebAssembly
//!
//! WASM SIMD doesn't have feature detection, so you need to compile two versions of your
//! bundle for WASM, one with SIMD and one without, then select the appropriate one for your
//! user's browser.
//! TODO: Expand on this.
//!
//! ## Credits
//!
//! This crate was inspired by [`pulp`] and [`std::simd`], among others in the Rust ecosystem,
//! though it makes many decisions differently.
//! It benefited from conversations with Luca Versari, though he is not responsible for any of
//! the mistakes or bad decisions.
//!
//! # Feature Flags
//!
//! The following crate [feature flags](https://doc.rust-lang.org/cargo/reference/features.html#dependency-features) are available:
//!
//! - `std` (enabled by default): Get floating point functions from the standard library
//!   (likely using your target's libc).
//!   Also allows using [`Level::new`] on all platforms, to detect which target features are enabled.
//! - `libm`: Use floating point implementations from [libm].
//! - `safe_wrappers`: Include safe wrappers for (some) target feature specific intrinsics,
//!   beyond the basic SIMD operations abstracted on all platforms.
//!
//! At least one of `std` and `libm` is required; `std` overrides `libm`.
//!
//! [`pulp`]: https://crates.io/crates/pulp
//! [libm]: https://crates.io/crates/libm

// LINEBENDER LINT SET - lib.rs - v3
// See https://linebender.org/wiki/canonical-lints/
// These lints shouldn't apply to examples or tests.
#![cfg_attr(not(test), warn(unused_crate_dependencies))]
// These lints shouldn't apply to examples.
#![warn(clippy::print_stdout, clippy::print_stderr)]
// Targeting e.g. 32-bit means structs containing usize can give false positives for 64-bit.
#![cfg_attr(target_pointer_width = "64", warn(clippy::trivially_copy_pass_by_ref))]
// END LINEBENDER LINT SET
#![cfg_attr(docsrs, feature(doc_cfg))]
#![allow(non_camel_case_types, reason = "TODO")]
#![expect(clippy::unused_unit, reason = "easier for code generation")]
#![expect(
    clippy::new_without_default,
    clippy::use_self,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]
#![no_std]

#[cfg(feature = "std")]
extern crate std;

#[cfg(all(not(feature = "libm"), not(feature = "std")))]
compile_error!("fearless_simd requires either the `std` or `libm` feature");

// Suppress the unused_crate_dependencies lint when both std and libm are specified.
#[cfg(all(feature = "std", feature = "libm"))]
use libm as _;

pub mod core_arch;

mod impl_macros;

mod generated;
mod macros;
mod traits;

pub use generated::*;
pub use traits::*;

/// Implementations of [`Simd`] for 64 bit ARM.
#[cfg(target_arch = "aarch64")]
pub mod aarch64 {
    pub use crate::generated::Neon;
}

/// Implementations of [`Simd`] for WebAssembly.
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
pub mod wasm32 {
    pub use crate::generated::WasmSimd128;
}

/// Implementations of [`Simd`] on x86 architectures (both 32 and 64 bit).
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub mod x86 {
    pub use crate::generated::Avx2;
    pub use crate::generated::Sse4_2;
}

/// The level enum with the specific SIMD capabilities available.
///
/// The contained values serve as a proof that the associated target
/// feature is available.
#[derive(Clone, Copy, Debug)]
#[non_exhaustive]
pub enum Level {
    /// Scalar fallback level, i.e. no supported SIMD features are to be used.
    ///
    /// This can be created with [`Level::fallback`].
    // TODO: Allow not compiling this in (probably only on web, but maybe elsewhere?)
    Fallback(Fallback),
    /// The Neon instruction set on 64 bit ARM.
    #[cfg(target_arch = "aarch64")]
    Neon(Neon),
    /// The SIMD 128 instructions on 32-bit WebAssembly.
    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    WasmSimd128(WasmSimd128),
    /// The SSE4.2 instruction set on (32 and 64 bit) x86.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Sse4_2(Sse4_2), /// The AVX2 and FMA instruction set on (32 and 64 bit) x86. #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] Avx2(Avx2), // If new variants are added, make sure to handle them in `Level::dispatch` // and `dispatch!()` } impl Level { /// Detect the available features on the current CPU, and returns the best level. /// /// If no SIMD instruction set is available, a scalar fallback will be used instead. /// /// This function requires the standard library, to use the /// [`is_x86_feature_detected`](std::arch::is_x86_feature_detected) /// or [`is_aarch64_feature_detected`](std::arch::is_aarch64_feature_detected). /// On wasm32, this requirement does not apply, so the standard library isn't required. /// /// Note that in most cases, this function should only be called by end-user applications. /// Libraries should instead accept a `Level` argument, probably as they are /// creating their data structures, then storing the level for any computations. /// Libraries which wish to abstract away SIMD usage for their common-case clients, /// should make their non-`Level` entrypoint match this function's `cfg`; to instead /// handle this at runtime, they can use [`try_detect`](Self::try_detect), /// handling the `None` case as they deem fit (probably panicking). /// This strategy avoids users of the library inadvertently using the fallback level, /// even if the requisite target features are available. /// /// If you are on an embedded device where these macros are not supported, /// you should construct the relevant variants yourself, using whatever /// way your specific chip supports accessing the current level. /// /// This value should be passed to [`dispatch!()`]. #[cfg(any(feature = "std", target_arch = "wasm32"))] #[must_use] pub fn new() -> Self { #[cfg(target_arch = "aarch64")] if std::arch::is_aarch64_feature_detected!("neon") { return unsafe { Level::Neon(Neon::new_unchecked()) }; } #[cfg(target_arch = "wasm32")] { // WASM always either has the SIMD feature compiled in or not. #[cfg(target_feature = "simd128")] return Level::WasmSimd128(WasmSimd128::new_unchecked()); #[cfg(not(target_feature = "simd128"))] return Level::fallback(); } #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { if std::arch::is_x86_feature_detected!("avx2") && std::arch::is_x86_feature_detected!("fma") { return unsafe { Level::Avx2(Avx2::new_unchecked()) }; } else if std::arch::is_x86_feature_detected!("sse4.2") { return unsafe { Level::Sse4_2(Sse4_2::new_unchecked()) }; } } #[cfg(not(target_arch = "wasm32"))] Self::fallback() } /// Get the target feature level suitable for this run. /// /// Should be used in libraries if they wish to handle the case where /// target features cannot be detected at runtime. /// Most users should prefer [`new`](Self::new). /// This is discussed in more detail in `new`'s documentation. #[allow(clippy::allow_attributes, reason = "Only needed in some cfgs.")] #[allow(unreachable_code, reason = "Fallback unreachable in some cfgs.")] pub fn try_detect() -> Option { #[cfg(any(feature = "std", target_arch = "wasm32"))] return Some(Self::new()); None } /// If this is a proof that Neon (or better) is available, access that instruction set. /// /// This method should be preferred over matching against the `Neon` variant of self, /// because if Fearless SIMD gets support for an instruction set which is a superset of Neon, /// this method will return a value even if that "better" instruction set is available. 
    #[allow(clippy::allow_attributes, reason = "Only needed in some cfgs.")]
    #[allow(unreachable_code, reason = "Fallback unreachable in some cfgs.")]
    pub fn try_detect() -> Option<Self> {
        #[cfg(any(feature = "std", target_arch = "wasm32"))]
        return Some(Self::new());
        None
    }

    /// If this is a proof that Neon (or better) is available, access that instruction set.
    ///
    /// This method should be preferred over matching against the `Neon` variant of self,
    /// because if Fearless SIMD gets support for an instruction set which is a superset of
    /// Neon, this method will return a value even if that "better" instruction set is available.
    ///
    /// This can be used in combination with the `safe_wrappers` feature to gain checked access
    /// to the level-specific SIMD capabilities.
    #[cfg(target_arch = "aarch64")]
    #[inline]
    pub fn as_neon(self) -> Option<Neon> {
        match self {
            Level::Neon(neon) => Some(neon),
            _ => None,
        }
    }

    /// If this is a proof that SIMD 128 (or better) is available, access that instruction set.
    ///
    /// This method should be preferred over matching against the `WasmSimd128` variant of self,
    /// because if Fearless SIMD gets support for an instruction set which is a superset of
    /// SIMD 128, this method will return a value even if that "better" instruction set is
    /// available.
    ///
    /// This can be used in combination with the `safe_wrappers` feature to gain checked access
    /// to the level-specific SIMD capabilities.
    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
    #[inline]
    pub fn as_wasm_simd128(self) -> Option<WasmSimd128> {
        match self {
            Level::WasmSimd128(simd128) => Some(simd128),
            _ => None,
        }
    }

    /// If this is a proof that SSE4.2 (or better) is available, access that instruction set.
    ///
    /// This method should be preferred over matching against the `Sse4_2` variant of self,
    /// because if Fearless SIMD gets support for an instruction set which is a superset of
    /// SSE4.2, this method will return a value even if that "better" instruction set is
    /// available.
    ///
    /// This can be used in combination with the `safe_wrappers` feature to gain checked access
    /// to the level-specific SIMD capabilities.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[inline]
    pub fn as_sse4_2(self) -> Option<Sse4_2> {
        match self {
            Level::Sse4_2(sse42) => Some(sse42),
            _ => None,
        }
    }

    /// If this is a proof that AVX2 and FMA (or better) are available, access those
    /// instruction sets.
    ///
    /// This method should be preferred over matching against the `Avx2` variant of self,
    /// because if Fearless SIMD gets support for an instruction set which is a superset of
    /// AVX2, this method will return a value even if that "better" instruction set is
    /// available.
    ///
    /// This can be used in combination with the `safe_wrappers` feature to gain checked access
    /// to the level-specific SIMD capabilities.
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    #[inline]
    pub fn as_avx2(self) -> Option<Avx2> {
        match self {
            Level::Avx2(avx2) => Some(avx2),
            _ => None,
        }
    }

    /// Create a scalar fallback level, which uses no SIMD instructions.
    ///
    /// This is primarily intended for tests; most users should prefer [`Level::new`].
    #[inline]
    pub const fn fallback() -> Self {
        Self::Fallback(Fallback::new())
    }

    /// Dispatch `f` to a context where the target features which this `Level` proves are
    /// available are [enabled].
    ///
    /// Most users of Fearless SIMD should prefer to use [`dispatch!()`] to
    /// explicitly vectorize a function. That has a better developer experience
    /// than an implementation of [`WithSimd`], and is less likely to miss a vectorization
    /// opportunity.
    ///
    /// This has two use cases:
    /// 1) To call a manually written implementation of [`WithSimd`].
    /// 2) To ask the compiler to auto-vectorize scalar code.
    ///
    /// For the second case to work, the provided function *must* be attributed with
    /// `#[inline(always)]`.
    /// Note also that any calls that function makes to other functions will likely not be
    /// auto-vectorized, unless they are also `#[inline(always)]`.
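    ///
    /// A sketch combining both uses: a manual [`WithSimd`] implementation whose scalar body
    /// the compiler may auto-vectorize (the `SumSquares` type is hypothetical):
    ///
    /// ```
    /// use fearless_simd::{Level, Simd, WithSimd};
    ///
    /// struct SumSquares<'a>(&'a [f32]);
    ///
    /// impl WithSimd for SumSquares<'_> {
    ///     type Output = f32;
    ///
    ///     #[inline(always)]
    ///     fn with_simd<S: Simd>(self, _simd: S) -> f32 {
    ///         // A scalar body: with the target features enabled by `dispatch`,
    ///         // the compiler is free to auto-vectorize this loop.
    ///         self.0.iter().map(|x| x * x).sum()
    ///     }
    /// }
    ///
    /// let total = Level::new().dispatch(SumSquares(&[1.0, 2.0]));
    /// assert_eq!(total, 5.0);
    /// ```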
    ///
    /// [enabled]: https://doc.rust-lang.org/reference/attributes/codegen.html#the-target_feature-attribute
    #[inline]
    pub fn dispatch<W: WithSimd>(self, f: W) -> W::Output {
        #[cfg(target_arch = "aarch64")]
        #[target_feature(enable = "neon")]
        #[inline]
        fn dispatch_neon<W: WithSimd>(f: W, neon: Neon) -> W::Output {
            f.with_simd(neon)
        }
        #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
        #[inline]
        fn dispatch_simd128<W: WithSimd>(f: W, simd128: WasmSimd128) -> W::Output {
            f.with_simd(simd128)
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[target_feature(enable = "sse4.2")]
        #[inline]
        fn dispatch_sse4_2<W: WithSimd>(f: W, sse4_2: Sse4_2) -> W::Output {
            f.with_simd(sse4_2)
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[target_feature(enable = "avx2,fma")]
        #[inline]
        fn dispatch_avx2<W: WithSimd>(f: W, avx2: Avx2) -> W::Output {
            f.with_simd(avx2)
        }
        #[inline]
        fn dispatch_fallback<W: WithSimd>(f: W, fallback: Fallback) -> W::Output {
            f.with_simd(fallback)
        }
        match self {
            #[cfg(target_arch = "aarch64")]
            Level::Neon(neon) => unsafe { dispatch_neon(f, neon) },
            #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
            Level::WasmSimd128(simd128) => dispatch_simd128(f, simd128),
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Level::Sse4_2(sse4_2) => unsafe { dispatch_sse4_2(f, sse4_2) },
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            Level::Avx2(avx2) => unsafe { dispatch_avx2(f, avx2) },
            Level::Fallback(fallback) => dispatch_fallback(f, fallback),
        }
    }
}

#[cfg(test)]
mod tests {
    use crate::Level;

    const fn assert_is_send_sync<T: Send + Sync>() {}

    /// If this test compiles, we know that [`Level`] is properly `Send` and `Sync`.
    #[test]
    fn level_is_send_sync() {
        assert_is_send_sync::<Level>();
    }
}
fearless_simd-0.3.0/src/macros.rs000064400000000000000000000234341046102023000150730ustar 00000000000000
// Copyright 2024 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

//! Macros publicly exported

/// Defines a new function which dispatches to a SIMD-generic function, enabling the correct
/// target features.
///
/// The `fn` token in the definition can be prefixed with a visibility (e.g. `pub`),
/// to set the visibility of the outer function.
/// We recommend that the implementation function remains private and is only called
/// through the dispatch function.
/// (The exact patterns for SIMD functions using Fearless SIMD have not
/// yet been designed/enumerated.)
///
/// The implementation function (which is outside of this macro) *should* have the
/// `#[inline(always)]` attribute.
/// There are likely to be severe performance consequences if this is not the case, as
/// Rust will be unable to inline SIMD intrinsics otherwise.
///
/// The `fn` token in the definition can be prefixed with `unsafe`, to allow an unsafe inner
/// function. The safety comment you add at each call to the generated function must
/// establish the preconditions required to call the inner function.
///
/// # Examples
///
/// ```rust
/// use fearless_simd::{Simd, simd_dispatch};
///
/// #[inline(always)]
/// fn sigmoid_impl<S: Simd>(simd: S, x: &[f32], out: &mut [f32]) { /* ... */ }
///
/// simd_dispatch!(fn sigmoid(level, x: &[f32], out: &mut [f32]) = sigmoid_impl);
/// ```
///
/// The signature of the generated function will be:
///
/// ```rust
/// use fearless_simd::Level;
/// fn sigmoid(level: Level, x: &[f32], out: &mut [f32]) { /* ... */ }
/// ```
#[macro_export]
#[deprecated = "use dispatch!(level, simd => operation) instead"]
macro_rules! simd_dispatch {
    (
        $( #[$meta:meta] )*
        $vis:vis unsafe fn $func:ident ( level $( , $arg:ident : $ty:ty $(,)? )* ) $( -> $ret:ty )? = $inner:ident
    ) => {
        simd_dispatch!{@impl => $(#[$meta])* $vis (unsafe) fn $func (level $(, $arg: $ty)*) $(-> $ret)? = $inner}
    };
    (
        $( #[$meta:meta] )*
        $vis:vis fn $func:ident ( level $( , $arg:ident : $ty:ty $(,)? )* ) $( -> $ret:ty )? = $inner:ident
    ) => {
        simd_dispatch!{@impl => $(#[$meta])* $vis () fn $func (level $(, $arg: $ty)*) $(-> $ret)? = $inner}
    };
    (
        @impl => $( #[$meta:meta] )*
        $vis:vis ($($unsafe: ident)?) fn $func:ident ( level $( , $arg:ident : $ty:ty $(,)? )* ) $( -> $ret:ty )? = $inner:ident
    ) => {
        $( #[$meta] )*
        $vis $($unsafe)? fn $func(level: $crate::Level $(, $arg: $ty )*) $( -> $ret )? {
            #[cfg(target_arch = "aarch64")]
            #[target_feature(enable = "neon")]
            #[inline]
            $($unsafe)? fn inner_neon(neon: $crate::aarch64::Neon $( , $arg: $ty )* ) $( -> $ret )? {
                $($unsafe)? { $inner( neon $( , $arg )* ) }
            }
            #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
            #[inline]
            $($unsafe)? fn inner_wasm_simd128(simd128: $crate::wasm32::WasmSimd128 $( , $arg: $ty )* ) $( -> $ret )? {
                $($unsafe)? { $inner( simd128 $( , $arg )* ) }
            }
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[target_feature(enable = "sse4.2")]
            #[inline]
            $($unsafe)? fn inner_sse4_2(sse4_2: $crate::x86::Sse4_2 $( , $arg: $ty )* ) $( -> $ret )? {
                $($unsafe)? { $inner( sse4_2 $( , $arg )* ) }
            }
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[target_feature(enable = "avx2,fma")]
            #[inline]
            $($unsafe)? fn inner_avx2(avx2: $crate::x86::Avx2 $( , $arg: $ty )* ) $( -> $ret )? {
                $($unsafe)? { $inner( avx2 $( , $arg )* ) }
            }
            match level {
                $crate::Level::Fallback(fb) => { $($unsafe)? { $inner(fb $( , $arg )* ) } },
                #[cfg(target_arch = "aarch64")]
                $crate::Level::Neon(neon) => unsafe { inner_neon(neon $( , $arg )* ) },
                #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
                $crate::Level::WasmSimd128(wasm) => unsafe { inner_wasm_simd128(wasm $( , $arg )* ) },
                #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
                $crate::Level::Sse4_2(sse4_2) => unsafe { inner_sse4_2(sse4_2 $( , $arg )* ) },
                #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
                $crate::Level::Avx2(avx2) => unsafe { inner_avx2(avx2 $( , $arg )* ) },
                _ => unreachable!(),
            }
        }
    };
}

/// Access the applicable [`Simd`] for a given `level`, and perform an operation using it.
///
/// This macro is the root of how any explicitly written SIMD functions in this crate are
/// called from a non-SIMD context.
///
/// The first parameter to the macro is the [`Level`].
/// You should prefer to construct a [`Level`] once and pass it around, rather than
/// frequently calling [`Level::new()`].
/// This is because `Level::new` has to detect which target features are available, which can
/// be slow.
///
/// The code of the operation will be repeated literally several times in the output, so you
/// should prefer to keep this code small (as it will be type-checked, etc. for each supported
/// SIMD level on your target).
/// In most cases, it should be a single call to a function which is generic over `Simd`
/// implementations, as seen in [the example](#example).
/// To be clear, it will only be *executed* once per invocation of `dispatch!`.
///
/// To guarantee target-feature-specific code generation, any functions called within the
/// operation should be `#[inline(always)]`.
///
/// Note that as an implementation detail of this macro, the operation will be executed inside
/// a closure.
/// This is what enables the target features to be enabled for the code inside the operation.
/// A consequence of this is that early `return` and `?` will not work as expected.
/// Note that in cases where you use `dispatch!` to call a single function (which we expect to
/// be the majority of cases), you can use `?` on the return value of `dispatch!` instead.
/// To emulate early return, you can use [`ControlFlow`](core::ops::ControlFlow) instead.
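///
/// For instance, `?` can be applied to the value the macro returns (an illustrative sketch;
/// `checked_sum` is a hypothetical stand-in):
///
/// ```
/// use fearless_simd::{Level, Simd, dispatch};
///
/// fn checked_total(level: Level, xs: &[f32]) -> Result<f32, ()> {
///     #[inline(always)]
///     fn checked_sum<S: Simd>(_simd: S, xs: &[f32]) -> Result<f32, ()> {
///         Ok(xs.iter().sum())
///     }
///     // `?` applies to the value returned by `dispatch!`, outside the operation.
///     let total = dispatch!(level, simd => checked_sum(simd, xs))?;
///     Ok(total)
/// }
/// ```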
///
/// # Example
///
/// ```
/// use fearless_simd::{Level, Simd, dispatch};
///
/// #[inline(always)]
/// fn sigmoid<S: Simd>(simd: S, x: &[f32], out: &mut [f32]) { /* ... */ }
///
/// let level = Level::new();
///
/// dispatch!(level, simd => sigmoid(simd, &[/*...*/], &mut [/*...*/]));
/// ```
///
/// [`Level`]: crate::Level
/// [`Level::new()`]: crate::Level::new
/// [`Simd`]: crate::Simd
#[macro_export]
macro_rules! dispatch {
    ($level:expr, $simd:pat => $op:expr) => {{
        /// Convert the `Simd` value into an `impl Simd`, which enforces that
        /// it is correctly handled.
        #[inline(always)]
        fn launder<S: $crate::Simd>(x: S) -> impl $crate::Simd {
            x
        }
        match $level {
            $crate::Level::Fallback(fb) => {
                let $simd = launder(fb);
                // This vectorize call does nothing, but it is reasonable to be consistent here.
                $crate::Simd::vectorize(
                    fb,
                    #[inline(always)]
                    || $op,
                )
            }
            #[cfg(target_arch = "aarch64")]
            $crate::Level::Neon(neon) => {
                let $simd = launder(neon);
                $crate::Simd::vectorize(
                    neon,
                    #[inline(always)]
                    || $op,
                )
            }
            #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
            $crate::Level::WasmSimd128(wasm) => {
                let $simd = launder(wasm);
                $crate::Simd::vectorize(
                    wasm,
                    #[inline(always)]
                    || $op,
                )
            }
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            $crate::Level::Sse4_2(sse4_2) => {
                let $simd = launder(sse4_2);
                $crate::Simd::vectorize(
                    sse4_2,
                    #[inline(always)]
                    || $op,
                )
            }
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            $crate::Level::Avx2(avx2) => {
                let $simd = launder(avx2);
                $crate::Simd::vectorize(
                    avx2,
                    #[inline(always)]
                    || $op,
                )
            }
            _ => unreachable!(),
        }
    }};
}

#[cfg(test)]
// This expect also validates that we haven't missed any levels!
#[expect(
    unreachable_patterns,
    reason = "Level is non_exhaustive, but you must be exhaustive within the same crate."
)]
mod tests {
    use crate::{Level, Simd};

    #[allow(dead_code, reason = "Compile test")]
    fn dispatch_generic() {
        fn generic<S: Simd, T>(_: S, x: T) -> T {
            x
        }
        dispatch!(Level::new(), simd => generic::<_, ()>(simd, ()));
    }

    #[allow(dead_code, reason = "Compile test")]
    fn dispatch_value() {
        fn make_fn<S: Simd>() -> impl FnOnce(S) {
            |_| ()
        }
        dispatch!(Level::new(), simd => (make_fn())(simd));
    }

    #[test]
    fn dispatch_output() {
        assert_eq!(42, dispatch!(Level::new(), _simd => 42));
    }
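
    /// Emulating early return inside the dispatched operation with
    /// [`ControlFlow`](core::ops::ControlFlow), as suggested in the macro docs above.
    /// An illustrative compile test; `step` is a hypothetical stand-in.
    #[allow(dead_code, reason = "Compile test")]
    fn dispatch_control_flow() {
        use core::ops::ControlFlow;

        fn step<S: Simd>(_: S, x: i32) -> ControlFlow<i32, i32> {
            if x < 0 {
                ControlFlow::Break(x)
            } else {
                ControlFlow::Continue(x + 1)
            }
        }
        let _ = dispatch!(Level::new(), simd => step(simd, 1));
    }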

    mod no_import_simd {
        /// We should be able to use [`dispatch`] in a scope which doesn't import anything.
        #[test]
        fn dispatch_with_no_imports() {
            let res = dispatch!(crate::Level::new(), _ => 1 + 2);
            assert_eq!(res, 3);
        }
    }
}
fearless_simd-0.3.0/src/traits.rs000064400000000000000000000053771046102023000151210ustar 00000000000000
// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

#![expect(
    missing_docs,
    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]

use crate::{Level, Simd};

pub trait Select<T> {
    fn select(self, if_true: T, if_false: T) -> T;
}

// Same as pulp.
pub trait WithSimd {
    type Output;

    fn with_simd<S: Simd>(self, simd: S) -> Self::Output;
}

impl<R, F: FnOnce(Level) -> R> WithSimd for F {
    type Output = R;

    #[inline(always)]
    fn with_simd<S: Simd>(self, simd: S) -> Self::Output {
        self(simd.level())
    }
}

pub trait Bytes: Sized {
    type Bytes;

    fn to_bytes(self) -> Self::Bytes;
    fn from_bytes(value: Self::Bytes) -> Self;

    fn bitcast<U: Bytes<Bytes = Self::Bytes>>(self) -> U {
        U::from_bytes(self.to_bytes())
    }
}

pub(crate) mod seal {
    #[expect(unnameable_types, reason = "TODO")]
    pub trait Seal {}
}

/// Value conversion, adding a SIMD blessing.
///
/// Analogous to [`From`], but takes a SIMD token, which is used to bless
/// the new value. Most such conversions are safe transmutes, but this
/// trait also supports splats, and implementations can use the SIMD token
/// to use an efficient splat intrinsic.
///
/// The [`SimdInto`] trait is also provided for convenience.
pub trait SimdFrom<T, S: Simd> {
    fn simd_from(value: T, simd: S) -> Self;
}

/// Value conversion, adding a SIMD blessing.
///
/// This trait is syntactic sugar for [`SimdFrom`] and exists only to allow
/// `impl SimdInto` syntax in signatures, which would otherwise require
/// cumbersome `where` clauses in terms of `SimdFrom`.
///
/// Avoid implementing this trait directly; prefer implementing [`SimdFrom`].
pub trait SimdInto<T, S> {
    fn simd_into(self, simd: S) -> T;
}

impl<T: SimdFrom<F, S>, F, S: Simd> SimdInto<T, S> for F {
    fn simd_into(self, simd: S) -> T {
        SimdFrom::simd_from(self, simd)
    }
}

impl<T, S: Simd> SimdFrom<T, S> for T {
    fn simd_from(value: T, _simd: S) -> Self {
        value
    }
}

pub trait SimdElement {
    type Mask: SimdElement;
}

impl SimdElement for f32 {
    type Mask = i32;
}
impl SimdElement for f64 {
    type Mask = i64;
}
impl SimdElement for u8 {
    type Mask = i8;
}
impl SimdElement for i8 {
    type Mask = i8;
}
impl SimdElement for u16 {
    type Mask = i16;
}
impl SimdElement for i16 {
    type Mask = i16;
}
impl SimdElement for u32 {
    type Mask = i32;
}
impl SimdElement for i32 {
    type Mask = i32;
}
impl SimdElement for i64 {
    type Mask = i64;
}

/// Construction of integer vectors from floats by truncation.
pub trait SimdCvtTruncate<T> {
    fn truncate_from(x: T) -> Self;
}

/// Construction of floating point vectors from integers.
pub trait SimdCvtFloat<T> {
    fn float_from(x: T) -> Self;
}
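
#[cfg(test)]
mod tests {
    use crate::{Simd, SimdInto};

    /// An illustrative compile test: the blanket `SimdFrom<T, S> for T` impl
    /// above makes `simd_into` an identity conversion for any type.
    #[allow(dead_code, reason = "Compile test")]
    fn identity_simd_into<S: Simd>(simd: S) -> u32 {
        42_u32.simd_into(simd)
    }
}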