wide-0.7.32/ — source archive (commit d5ada752ca63e527022c4663337163f3579946f7)

wide-0.7.32/.cargo-ci/config

# Note: Cargo doesn't carry these settings to dependencies. They only affect the
# process of directly building the crate. This is so that we can easily use
# `cargo test` and `cargo doc` and so on during development.

[build]
# This can cause weirdness!
rustflags = ["-Ctarget-cpu=native"]
rustdocflags = ["-Ctarget-cpu=native"]

[target.wasm32-wasi]
runner = "wasmtime run --wasm-features all --dir ."
rustflags = ["-Ctarget-feature=+simd128,+bulk-memory,+nontrapping-fptoint,+sign-ext"]

wide-0.7.32/.github/FUNDING.yml

# These are supported funding model platforms
github: [Lokathor]

wide-0.7.32/.github/workflows/rust.yml

name: Rust

on:
  push: {}
  pull_request: {}

jobs:
  build_test:
    runs-on: ${{ matrix.rust.os }}
    strategy:
      matrix:
        rust:
          # x86 without sse/sse2 on by default
          - { target: i586-pc-windows-msvc, toolchain: "1.61", os: windows-latest }
          - { target: i586-pc-windows-msvc, toolchain: stable, os: windows-latest }
          - { target: i586-pc-windows-msvc, toolchain: beta, os: windows-latest }
          - { target: i586-pc-windows-msvc, toolchain: nightly, os: windows-latest }
          # x86
          - { target: i686-pc-windows-msvc, toolchain: "1.61", os: windows-latest }
          - { target: i686-pc-windows-msvc, toolchain: stable, os: windows-latest }
          - { target: i686-pc-windows-msvc, toolchain: beta, os: windows-latest }
          - { target: i686-pc-windows-msvc, toolchain: nightly, os: windows-latest }
          # x86_64
          - { target: x86_64-unknown-linux-gnu, toolchain: "1.61", os: ubuntu-latest }
          - { target: x86_64-unknown-linux-gnu, toolchain: stable, os: ubuntu-latest }
          - { target: x86_64-unknown-linux-gnu, toolchain: beta, os: ubuntu-latest }
          - { target: x86_64-unknown-linux-gnu, toolchain: nightly, os: ubuntu-latest }
          # aarch64
          - { target: aarch64-apple-darwin, toolchain: "1.61", os: macos-latest }
          - { target: aarch64-apple-darwin, toolchain: stable, os: macos-latest }
          - { target: aarch64-apple-darwin, toolchain: beta, os: macos-latest }
          - { target: aarch64-apple-darwin, toolchain: nightly, os: macos-latest }
          # wasm32
          #- { target: wasm32-wasi, toolchain: "1.61", os: ubuntu-latest, wasmtime: v5.0.0 }
          #- { target: wasm32-wasi, toolchain: stable, os: ubuntu-latest, wasmtime: v5.0.0 }
          #- { target: wasm32-wasip1, toolchain: beta, os: ubuntu-latest, wasmtime: v5.0.0 }
          #- { target: wasm32-wasip1, toolchain: nightly, os: ubuntu-latest, wasmtime: v5.0.0 }
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@master
        with:
          toolchain: ${{ matrix.rust.toolchain }}
          target: ${{ matrix.rust.target }}
      - name: Install wasmtime
        if: matrix.rust.target == 'wasm32-wasi' || matrix.rust.target == 'wasm32-wasip1'
        run: |
          curl https://wasmtime.dev/install.sh -sSf | bash -s -- --version ${{ matrix.rust.wasmtime }}
          echo "$HOME/.wasmtime/bin" >> $GITHUB_PATH
      - name: Build the crate
        run: cargo build --target ${{ matrix.rust.target }}
      - name: Test with default CPU features + No Default Cargo Features
        env:
          CARGO_TARGET_WASM32_WASI_RUNNER: wasmtime run --wasm-features all --dir .
        run: cargo test --target ${{ matrix.rust.target }} --no-default-features
      - name: Test with default CPU features + All Cargo Features
        env:
          CARGO_TARGET_WASM32_WASI_RUNNER: wasmtime run --wasm-features all --dir .
        run: cargo test --target ${{ matrix.rust.target }} --all-features
      - name: Build the crate with SSE4.1 (the "native" of CI will be above this)
        if: matrix.rust.os == 'ubuntu-latest' && matrix.rust.target == 'x86_64-unknown-linux-gnu'
        run: RUSTFLAGS="-Ctarget-feature=+sse4.1" cargo build --target ${{ matrix.rust.target }}
      - name: switch over to native cpu features
        run: mv .cargo-ci .cargo
      - name: Test with 'native' CPU features + No Default Cargo Features
        run: cargo test --target ${{ matrix.rust.target }} --no-default-features
      - name: Test with 'native' CPU features + All Cargo Features
        run: cargo test --target ${{ matrix.rust.target }} --all-features

  #cross_compile_aarch64:
  #  runs-on: ubuntu-latest
  #  strategy:
  #    matrix:
  #      rust: ["1.61", stable, nightly]
  #      features: ["", "std"]
  #  steps:
  #    - name: Installing emulator and linker
  #      run: |
  #        sudo apt-get update
  #        sudo apt-get install qemu binfmt-support qemu-user-static gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu
  #    - name: Installing Rust toolchain
  #      uses: dtolnay/rust-toolchain@master
  #      with:
  #        toolchain: ${{ matrix.rust }}
  #        target: aarch64-unknown-linux-musl
  #    - uses: actions/checkout@v4
  #    - name: build
  #      run: >
  #        cargo build --verbose --no-default-features --target aarch64-unknown-linux-musl --features "$FEATURES"
  #      env:
  #        CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_LINKER: aarch64-linux-gnu-gcc
  #    - name: test
  #      run: >
  #        cargo test --tests --benches --no-default-features --target aarch64-unknown-linux-musl --features "$FEATURES"
  #      env:
  #        FEATURES: ${{ matrix.features }}
  #        CARGO_TARGET_AARCH64_UNKNOWN_LINUX_MUSL_LINKER: aarch64-linux-gnu-gcc

wide-0.7.32/.gitignore

Cargo.lock
/target/
/.vscode/

# These are backup files generated by rustfmt
**/*.rs.bk

wide-0.7.32/Cargo.toml

[package]
name = "wide"
description = "A crate to help you go wide."
version = "0.7.32"
authors = ["Lokathor "]
repository = "https://github.com/Lokathor/wide"
readme = "README.md"
keywords = ["simd", "wide", "lokathor"]
categories = ["data-structures", "hardware-support"]
edition = "2018"
license = "Zlib OR Apache-2.0 OR MIT"
# Aarch64 needs 1.59 while others need 1.56
# When updating, also update CI workflows and the badge in the README.
rust-version = "1.61"

[features]
default = ["std"]
# Activate `std` within the crate. Currently this gives a much faster `sqrt`
# impl when an explicit hardware sqrt isn't available.
std = []
serde = ["dep:serde"]

[dependencies]
safe_arch = { version = "0.7", features = ["bytemuck"] }
serde = { version = "1", default-features = false, optional = true }
bytemuck = "1"

[dev-dependencies]
bincode = { version = "1.3.3" }

wide-0.7.32/LICENSE-ZLIB.md

Copyright (c) 2020 Daniel "Lokathor" Gee.

This software is provided 'as-is', without any express or implied warranty.
In no event will the authors be held liable for any damages arising from the
use of this software.

Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:

1. The origin of this software must not be misrepresented; you must not
   claim that you wrote the original software. If you use this software in a
   product, an acknowledgment in the product documentation would be
   appreciated but is not required.

2. Altered source versions must be plainly marked as such, and must not be
   misrepresented as being the original software.

3. This notice may not be removed or altered from any source distribution.

wide-0.7.32/README.md

[![License:Zlib](https://img.shields.io/badge/License-Zlib-brightgreen.svg)](https://opensource.org/licenses/Zlib)
![Minimum Rust Version](https://img.shields.io/badge/Min%20Rust-1.61-green.svg)
[![crates.io](https://img.shields.io/crates/v/wide.svg)](https://crates.io/crates/wide)
[![docs.rs](https://docs.rs/wide/badge.svg)](https://docs.rs/wide/)

# wide

A crate to help you go wide.

Specifically, this has portable "wide" data types that do their best to be SIMD when possible.

On `x86`, `x86_64`, `wasm32` and `aarch64 neon` this is done with explicit intrinsic usage (via [safe_arch](https://docs.rs/safe_arch)), and on other architectures this is done by carefully writing functions so that LLVM hopefully does the right thing.

When Rust stabilizes more explicit intrinsics then they can go into `safe_arch` and then they can get used here.
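A minimal usage sketch to make "go wide" concrete (illustrative only: it assumes `wide` is pulled in as a normal Cargo dependency, and the methods shown — `new`, `splat`, lanewise `+`, `min`, `to_array` — are the ones defined in `src/f32x4_.rs` below):

```rust
use wide::f32x4;

fn main() {
  // Four f32 lanes processed at once; on x86 this lowers to SSE instructions.
  let a = f32x4::new([1.0, 2.0, 3.0, 4.0]);
  let b = f32x4::splat(10.0);
  let sum = a + b; // lanewise add -> [11.0, 12.0, 13.0, 14.0]
  let clamped = sum.min(f32x4::splat(12.5)); // lanewise, NaN-aware min
  assert_eq!(clamped.to_array(), [11.0, 12.0, 12.5, 12.5]);
}
```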
wide-0.7.32/rustfmt.toml

merge_imports = true
use_try_shorthand = true
tab_spaces = 2
max_width = 80
color = "Never"
use_small_heuristics = "Max"
format_code_in_doc_comments = true
wrap_comments = true

wide-0.7.32/src/f32x4_.rs

use super::*;

pick! {
  if #[cfg(target_feature="sse")] {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(16))]
    pub struct f32x4 { pub(crate) sse: m128 }
  } else if #[cfg(target_feature="simd128")] {
    use core::arch::wasm32::*;

    #[derive(Clone, Copy)]
    #[repr(transparent)]
    pub struct f32x4 { pub(crate) simd: v128 }

    impl Default for f32x4 {
      fn default() -> Self {
        Self::splat(0.0)
      }
    }

    impl PartialEq for f32x4 {
      fn eq(&self, other: &Self) -> bool {
        u32x4_all_true(f32x4_eq(self.simd, other.simd))
      }
    }
  } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] {
    use core::arch::aarch64::*;

    #[repr(C)]
    #[derive(Copy, Clone)]
    pub struct f32x4 { pub(crate) neon: float32x4_t }

    impl Default for f32x4 {
      #[inline]
      #[must_use]
      fn default() -> Self {
        unsafe { Self { neon: vdupq_n_f32(0.0) } }
      }
    }

    impl PartialEq for f32x4 {
      #[inline]
      #[must_use]
      fn eq(&self, other: &Self) -> bool {
        unsafe { vminvq_u32(vceqq_f32(self.neon, other.neon)) == u32::MAX }
      }
    }
  } else {
    #[derive(Default, Clone, Copy, PartialEq)]
    #[repr(C, align(16))]
    pub struct f32x4 { pub(crate) arr: [f32; 4] }
  }
}

macro_rules!
const_f32_as_f32x4 { ($i:ident, $f:expr) => { #[allow(non_upper_case_globals)] pub const $i: f32x4 = f32x4::new([$f; 4]); }; } impl f32x4 { const_f32_as_f32x4!(ONE, 1.0); const_f32_as_f32x4!(ZERO, 0.0); const_f32_as_f32x4!(HALF, 0.5); const_f32_as_f32x4!(E, core::f32::consts::E); const_f32_as_f32x4!(FRAC_1_PI, core::f32::consts::FRAC_1_PI); const_f32_as_f32x4!(FRAC_2_PI, core::f32::consts::FRAC_2_PI); const_f32_as_f32x4!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI); const_f32_as_f32x4!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2); const_f32_as_f32x4!(FRAC_PI_2, core::f32::consts::FRAC_PI_2); const_f32_as_f32x4!(FRAC_PI_3, core::f32::consts::FRAC_PI_3); const_f32_as_f32x4!(FRAC_PI_4, core::f32::consts::FRAC_PI_4); const_f32_as_f32x4!(FRAC_PI_6, core::f32::consts::FRAC_PI_6); const_f32_as_f32x4!(FRAC_PI_8, core::f32::consts::FRAC_PI_8); const_f32_as_f32x4!(LN_2, core::f32::consts::LN_2); const_f32_as_f32x4!(LN_10, core::f32::consts::LN_10); const_f32_as_f32x4!(LOG2_E, core::f32::consts::LOG2_E); const_f32_as_f32x4!(LOG10_E, core::f32::consts::LOG10_E); const_f32_as_f32x4!(LOG10_2, core::f32::consts::LOG10_2); const_f32_as_f32x4!(LOG2_10, core::f32::consts::LOG2_10); const_f32_as_f32x4!(PI, core::f32::consts::PI); const_f32_as_f32x4!(SQRT_2, core::f32::consts::SQRT_2); const_f32_as_f32x4!(TAU, core::f32::consts::TAU); } unsafe impl Zeroable for f32x4 {} unsafe impl Pod for f32x4 {} impl Add for f32x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: add_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_f32(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0] + rhs.arr[0], self.arr[1] + rhs.arr[1], self.arr[2] + rhs.arr[2], self.arr[3] + rhs.arr[3], ]} } } } } impl Sub for f32x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: sub_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vsubq_f32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0] - rhs.arr[0], self.arr[1] - rhs.arr[1], self.arr[2] - rhs.arr[2], self.arr[3] - rhs.arr[3], ]} } } } } impl Mul for f32x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: mul_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_mul(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmulq_f32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0] * rhs.arr[0], self.arr[1] * rhs.arr[1], self.arr[2] * rhs.arr[2], self.arr[3] * rhs.arr[3], ]} } } } } impl Div for f32x4 { type Output = Self; #[inline] #[must_use] fn div(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse")] { Self { sse: div_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_div(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vdivq_f32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0] / rhs.arr[0], self.arr[1] / rhs.arr[1], self.arr[2] / rhs.arr[2], self.arr[3] / rhs.arr[3], ]} } } } } impl Add for f32x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: f32) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for f32x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: f32) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for f32x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: f32) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Div for f32x4 { type Output = Self; #[inline] #[must_use] fn div(self, rhs: f32) -> Self::Output { self.div(Self::splat(rhs)) } } impl Add for f32 { type Output = f32x4; #[inline] #[must_use] fn add(self, rhs: f32x4) -> Self::Output { f32x4::splat(self).add(rhs) } } impl Sub for f32 { type Output = f32x4; #[inline] #[must_use] fn sub(self, rhs: f32x4) -> Self::Output { f32x4::splat(self).sub(rhs) } } impl Mul for f32 { type Output = f32x4; #[inline] #[must_use] fn mul(self, rhs: f32x4) -> Self::Output { f32x4::splat(self).mul(rhs) } } impl Div for f32 { type Output = f32x4; #[inline] #[must_use] fn div(self, rhs: f32x4) -> Self::Output { f32x4::splat(self).div(rhs) } } impl BitAnd for f32x4 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: bitand_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }} } else { Self { arr: [ f32::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()), f32::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()), f32::from_bits(self.arr[2].to_bits() & rhs.arr[2].to_bits()), f32::from_bits(self.arr[3].to_bits() & rhs.arr[3].to_bits()), ]} } } } } impl BitOr for f32x4 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: bitor_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }} } else { Self { arr: [ f32::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()), f32::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()), f32::from_bits(self.arr[2].to_bits() | rhs.arr[2].to_bits()), f32::from_bits(self.arr[3].to_bits() | rhs.arr[3].to_bits()), ]} } } } } impl BitXor for f32x4 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse")] { Self { sse: bitxor_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(self.neon), vreinterpretq_u32_f32(rhs.neon))) }} } else { Self { arr: [ f32::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()), f32::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()), f32::from_bits(self.arr[2].to_bits() ^ rhs.arr[2].to_bits()), f32::from_bits(self.arr[3].to_bits() ^ rhs.arr[3].to_bits()), ]} } } } } impl CmpEq for f32x4 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: cmp_eq_mask_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(vceqq_f32(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] == rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[1] == rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[2] == rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[3] == rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 }, ]} } } } } impl CmpGe for f32x4 { type Output = Self; #[inline] #[must_use] fn cmp_ge(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: cmp_ge_mask_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_ge(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(vcgeq_f32(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] >= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[1] >= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[2] >= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[3] >= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 }, ]} } } } } impl CmpGt for f32x4 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: cmp_gt_mask_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_gt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(vcgtq_f32(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] > rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[1] > rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[2] > rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[3] > rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 }, ]} } } } } impl CmpNe for f32x4 { type Output = Self; #[inline] #[must_use] fn cmp_ne(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse")] { Self { sse: cmp_neq_mask_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_ne(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, rhs.neon))) }} } else { Self { arr: [ if self.arr[0] != rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[1] != rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[2] != rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[3] != rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 }, ]} } } } } impl CmpLe for f32x4 { type Output = Self; #[inline] #[must_use] fn cmp_le(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: cmp_le_mask_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_le(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(vcleq_f32(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] <= rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[1] <= rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[2] <= rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[3] <= rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 }, ]} } } } } impl CmpLt for f32x4 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse")] { Self { sse: cmp_lt_mask_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_lt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(vcltq_f32(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] < rhs.arr[0] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[1] < rhs.arr[1] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[2] < rhs.arr[2] { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[3] < rhs.arr[3] { f32::from_bits(u32::MAX) } else { 0.0 }, ]} } } } } impl f32x4 { #[inline] #[must_use] pub const fn new(array: [f32; 4]) -> Self { #[allow(non_upper_case_globals)] unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_m128(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else { generic_bit_blend(self, t, f) } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="simd128")] { Self { simd: f32x4_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vabsq_f32(self.neon) }} } else { let non_sign_bits = f32x4::from(f32::from_bits(i32::MAX as u32)); self & non_sign_bits } } } #[inline] #[must_use] pub fn floor(self) -> Self { pick! 
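  // Backend note: wasm `f32x4_floor`, SSE4.1 `floor_m128`, and NEON `vrndmq_f32`
  // give a hardware floor; with `std` the scalar `f32::floor` is used; otherwise
  // floor is derived from `round()` by stepping down any lane that rounded up.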
{ if #[cfg(target_feature="simd128")] { Self { simd: f32x4_floor(self.simd) } } else if #[cfg(target_feature="sse4.1")] { Self { sse: floor_m128(self.sse) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vrndmq_f32(self.neon) }} } else if #[cfg(feature="std")] { let base: [f32; 4] = cast(self); cast(base.map(|val| val.floor())) } else { let base: [f32; 4] = cast(self); let rounded: [f32; 4] = cast(self.round()); cast([ if base[0] < rounded[0] { rounded[0] - 1.0 } else { rounded[0] }, if base[1] < rounded[1] { rounded[1] - 1.0 } else { rounded[1] }, if base[2] < rounded[2] { rounded[2] - 1.0 } else { rounded[2] }, if base[3] < rounded[3] { rounded[3] - 1.0 } else { rounded[3] }, ]) } } } #[inline] #[must_use] pub fn ceil(self) -> Self { pick! { if #[cfg(target_feature="simd128")] { Self { simd: f32x4_ceil(self.simd) } } else if #[cfg(target_feature="sse4.1")] { Self { sse: ceil_m128(self.sse) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vrndpq_f32(self.neon) }} } else if #[cfg(feature="std")] { let base: [f32; 4] = cast(self); cast(base.map(|val| val.ceil())) } else { let base: [f32; 4] = cast(self); let rounded: [f32; 4] = cast(self.round()); cast([ if base[0] > rounded[0] { rounded[0] + 1.0 } else { rounded[0] }, if base[1] > rounded[1] { rounded[1] + 1.0 } else { rounded[1] }, if base[2] > rounded[2] { rounded[2] + 1.0 } else { rounded[2] }, if base[3] > rounded[3] { rounded[3] + 1.0 } else { rounded[3] }, ]) } } } /// Calculates the lanewise maximum of both vectors. This is a faster /// implementation than `max`, but it doesn't specify any behavior if NaNs are /// involved. #[inline] #[must_use] pub fn fast_max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse")] { Self { sse: max_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_pmax(self.simd, rhs.simd), } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxq_f32(self.neon, rhs.neon) }} } else { Self { arr: [ if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] }, if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] }, if self.arr[2] < rhs.arr[2] { rhs.arr[2] } else { self.arr[2] }, if self.arr[3] < rhs.arr[3] { rhs.arr[3] } else { self.arr[3] }, ]} } } } /// Calculates the lanewise maximum of both vectors. If either lane is NaN, /// the other lane gets chosen. Use `fast_max` for a faster implementation /// that doesn't handle NaNs. #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse")] { // max_m128 seems to do rhs < self ? self : rhs. So if there's any NaN // involved, it chooses rhs, so we need to specifically check rhs for // NaN. rhs.is_nan().blend(self, Self { sse: max_m128(self.sse, rhs.sse) }) } else if #[cfg(target_feature="simd128")] { // WASM has two max intrinsics: // - max: This propagates NaN, that's the opposite of what we need. // - pmax: This is defined as self < rhs ? rhs : self, which basically // chooses self if either is NaN. // // pmax is what we want, but we need to specifically check self for NaN. 
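      // `f32x4_ne(self, self)` is all-ones exactly in the NaN lanes of `self`, so
      // the bitselect below takes `rhs` for those lanes and the pmax result
      // everywhere else, which is how "if either lane is NaN, the other lane gets
      // chosen" holds on this backend.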
Self { simd: v128_bitselect( rhs.simd, f32x4_pmax(self.simd, rhs.simd), f32x4_ne(self.simd, self.simd), // NaN check ) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxnmq_f32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].max(rhs.arr[0]), self.arr[1].max(rhs.arr[1]), self.arr[2].max(rhs.arr[2]), self.arr[3].max(rhs.arr[3]), ]} } } } /// Calculates the lanewise minimum of both vectors. This is a faster /// implementation than `min`, but it doesn't specify any behavior if NaNs are /// involved. #[inline] #[must_use] pub fn fast_min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse")] { Self { sse: min_m128(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_pmin(self.simd, rhs.simd), } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminq_f32(self.neon, rhs.neon) }} } else { Self { arr: [ if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] }, if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] }, if self.arr[2] < rhs.arr[2] { self.arr[2] } else { rhs.arr[2] }, if self.arr[3] < rhs.arr[3] { self.arr[3] } else { rhs.arr[3] }, ]} } } } /// Calculates the lanewise minimum of both vectors. If either lane is NaN, /// the other lane gets chosen. Use `fast_min` for a faster implementation /// that doesn't handle NaNs. #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse")] { // min_m128 seems to do self < rhs ? self : rhs. So if there's any NaN // involved, it chooses rhs, so we need to specifically check rhs for // NaN. rhs.is_nan().blend(self, Self { sse: min_m128(self.sse, rhs.sse) }) } else if #[cfg(target_feature="simd128")] { // WASM has two min intrinsics: // - min: This propagates NaN, that's the opposite of what we need. // - pmin: This is defined as rhs < self ? rhs : self, which basically // chooses self if either is NaN. // // pmin is what we want, but we need to specifically check self for NaN. Self { simd: v128_bitselect( rhs.simd, f32x4_pmin(self.simd, rhs.simd), f32x4_ne(self.simd, self.simd), // NaN check ) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminnmq_f32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].min(rhs.arr[0]), self.arr[1].min(rhs.arr[1]), self.arr[2].min(rhs.arr[2]), self.arr[3].min(rhs.arr[3]), ]} } } } #[inline] #[must_use] pub fn is_nan(self) -> Self { pick! 
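  // Every backend below uses the same idea: NaN is the only value that does not
  // compare equal to itself, and the comparison result doubles as an
  // all-ones / all-zeros lane mask.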
{ if #[cfg(target_feature="sse")] { Self { sse: cmp_unord_mask_m128(self.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_ne(self.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f32_u32(vmvnq_u32(vceqq_f32(self.neon, self.neon))) }} } else { Self { arr: [ if self.arr[0].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[1].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[2].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 }, if self.arr[3].is_nan() { f32::from_bits(u32::MAX) } else { 0.0 }, ]} } } } #[inline] #[must_use] pub fn is_finite(self) -> Self { let shifted_exp_mask = u32x4::from(0xFF000000); let u: u32x4 = cast(self); let shift_u = u << 1_u64; let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask); cast(out) } #[inline] #[must_use] pub fn is_inf(self) -> Self { let shifted_inf = u32x4::from(0xFF000000); let u: u32x4 = cast(self); let shift_u = u << 1_u64; let out = (shift_u).cmp_eq(shifted_inf); cast(out) } #[inline] #[must_use] pub fn round(self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: round_m128::<{round_op!(Nearest)}>(self.sse) } } else if #[cfg(target_feature="sse2")] { let mi: m128i = convert_to_i32_m128i_from_m128(self.sse); let f: f32x4 = f32x4 { sse: convert_to_m128_from_i32_m128i(mi) }; let i: i32x4 = cast(mi); let mask: f32x4 = cast(i.cmp_eq(i32x4::from(0x80000000_u32 as i32))); mask.blend(self, f) } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_nearest(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vrndnq_f32(self.neon) }} } else { // Note(Lokathor): This software fallback is probably very slow compared // to having a hardware option available, even just the sse2 version is // better than this. Oh well. let to_int = f32x4::from(1.0 / f32::EPSILON); let u: u32x4 = cast(self); let e: i32x4 = cast((u >> 23) & u32x4::from(0xff)); let mut y: f32x4; let no_op_magic = i32x4::from(0x7f + 23); let no_op_mask: f32x4 = cast(e.cmp_gt(no_op_magic) | e.cmp_eq(no_op_magic)); let no_op_val: f32x4 = self; let zero_magic = i32x4::from(0x7f - 1); let zero_mask: f32x4 = cast(e.cmp_lt(zero_magic)); let zero_val: f32x4 = self * f32x4::from(0.0); let neg_bit: f32x4 = cast(cast::(u).cmp_lt(i32x4::default())); let x: f32x4 = neg_bit.blend(-self, self); y = x + to_int - to_int - x; y = y.cmp_gt(f32x4::from(0.5)).blend( y + x - f32x4::from(-1.0), y.cmp_lt(f32x4::from(-0.5)).blend(y + x + f32x4::from(1.0), y + x), ); y = neg_bit.blend(-y, y); no_op_mask.blend(no_op_val, zero_mask.blend(zero_val, y)) } } } /// Rounds each lane into an integer. This is a faster implementation than /// `round_int`, but it doesn't handle out of range values or NaNs. For those /// values you get implementation defined behavior. #[inline] #[must_use] pub fn fast_round_int(self) -> i32x4 { pick! { if #[cfg(target_feature="sse2")] { cast(convert_to_i32_m128i_from_m128(self.sse)) } else { self.round_int() } } } /// Rounds each lane into an integer. This saturates out of range values and /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that /// doesn't handle out of range values or NaNs. #[inline] #[must_use] pub fn round_int(self) -> i32x4 { pick! 
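  // In the SSE2 branch below, NaN lanes are zeroed first via the `self == self`
  // mask; the conversion yields i32::MIN for out-of-range lanes, and XOR-ing with
  // the `>= 2^31` mask flips those lanes to i32::MAX so large values saturate.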
{ if #[cfg(target_feature="sse2")] { // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 let non_nan_mask = self.cmp_eq(self); let non_nan = self & non_nan_mask; let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0))); let cast: i32x4 = cast(convert_to_i32_m128i_from_m128(non_nan.sse)); flip_to_max ^ cast } else if #[cfg(target_feature="simd128")] { cast(Self { simd: i32x4_trunc_sat_f32x4(f32x4_nearest(self.simd)) }) } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtnq_s32_f32(self.neon)) }}) } else { let rounded: [f32; 4] = cast(self.round()); cast([ rounded[0] as i32, rounded[1] as i32, rounded[2] as i32, rounded[3] as i32, ]) } } } /// Truncates each lane into an integer. This is a faster implementation than /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those /// values you get implementation defined behavior. #[inline] #[must_use] pub fn fast_trunc_int(self) -> i32x4 { pick! { if #[cfg(target_feature="sse2")] { cast(truncate_m128_to_m128i(self.sse)) } else { self.trunc_int() } } } /// Truncates each lane into an integer. This saturates out of range values /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation /// that doesn't handle out of range values or NaNs. #[inline] #[must_use] pub fn trunc_int(self) -> i32x4 { pick! { if #[cfg(target_feature="sse2")] { // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 let non_nan_mask = self.cmp_eq(self); let non_nan = self & non_nan_mask; let flip_to_max: i32x4 = cast(self.cmp_ge(Self::splat(2147483648.0))); let cast: i32x4 = cast(truncate_m128_to_m128i(non_nan.sse)); flip_to_max ^ cast } else if #[cfg(target_feature="simd128")] { cast(Self { simd: i32x4_trunc_sat_f32x4(self.simd) }) } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ cast(unsafe {Self { neon: vreinterpretq_f32_s32(vcvtq_s32_f32(self.neon)) }}) } else { let n: [f32;4] = cast(self); cast([ n[0] as i32, n[1] as i32, n[2] as i32, n[3] as i32, ]) } } } #[inline] #[must_use] pub fn mul_add(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="sse2",target_feature="fma"))] { Self { sse: fused_mul_add_m128(self.sse, m.sse, a.sse) } } else { (self * m) + a } } } #[inline] #[must_use] pub fn mul_sub(self, m: Self, s: Self) -> Self { pick! { if #[cfg(all(target_feature="sse2",target_feature="fma"))] { Self { sse: fused_mul_sub_m128(self.sse, m.sse, s.sse) } } else { (self * m) - s } } } #[inline] #[must_use] pub fn mul_neg_add(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="sse2",target_feature="fma"))] { Self { sse: fused_mul_neg_add_m128(self.sse, m.sse, a.sse) } } else { a - (self * m) } } } #[inline] #[must_use] pub fn mul_neg_sub(self, m: Self, a: Self) -> Self { pick! 
{ if #[cfg(all(target_feature="sse2",target_feature="fma"))] { Self { sse: fused_mul_neg_sub_m128(self.sse, m.sse, a.sse) } } else { -(self * m) - a } } } #[inline] #[must_use] pub fn flip_signs(self, signs: Self) -> Self { self ^ (signs & Self::from(-0.0)) } #[inline] #[must_use] pub fn copysign(self, sign: Self) -> Self { let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1)); (self & magnitude_mask) | (sign & Self::from(-0.0)) } #[inline] pub fn asin_acos(self) -> (Self, Self) { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x4!(P4asinf, 4.2163199048E-2); const_f32_as_f32x4!(P3asinf, 2.4181311049E-2); const_f32_as_f32x4!(P2asinf, 4.5470025998E-2); const_f32_as_f32x4!(P1asinf, 7.4953002686E-2); const_f32_as_f32x4!(P0asinf, 1.6666752422E-1); let xa = self.abs(); let big = xa.cmp_ge(f32x4::splat(0.5)); let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa); let x2 = xa * xa; let x3 = big.blend(x1, x2); let xb = x1.sqrt(); let x4 = big.blend(xb, xa); let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); let z = z.mul_add(x3 * x4, x4); let z1 = z + z; // acos let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1); let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self); let acos = big.blend(z3, z4); // asin let z3 = f32x4::FRAC_PI_2 - z1; let asin = big.blend(z3, z); let asin = asin.flip_signs(self); (asin, acos) } #[inline] pub fn asin(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x4!(P4asinf, 4.2163199048E-2); const_f32_as_f32x4!(P3asinf, 2.4181311049E-2); const_f32_as_f32x4!(P2asinf, 4.5470025998E-2); const_f32_as_f32x4!(P1asinf, 7.4953002686E-2); const_f32_as_f32x4!(P0asinf, 1.6666752422E-1); let xa = self.abs(); let big = xa.cmp_ge(f32x4::splat(0.5)); let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa); let x2 = xa * xa; let x3 = big.blend(x1, x2); let xb = x1.sqrt(); let x4 = big.blend(xb, xa); let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); let z = z.mul_add(x3 * x4, x4); let z1 = z + z; // asin let z3 = f32x4::FRAC_PI_2 - z1; let asin = big.blend(z3, z); let asin = asin.flip_signs(self); asin } #[inline] #[must_use] pub fn acos(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x4!(P4asinf, 4.2163199048E-2); const_f32_as_f32x4!(P3asinf, 2.4181311049E-2); const_f32_as_f32x4!(P2asinf, 4.5470025998E-2); const_f32_as_f32x4!(P1asinf, 7.4953002686E-2); const_f32_as_f32x4!(P0asinf, 1.6666752422E-1); let xa = self.abs(); let big = xa.cmp_ge(f32x4::splat(0.5)); let x1 = f32x4::splat(0.5) * (f32x4::ONE - xa); let x2 = xa * xa; let x3 = big.blend(x1, x2); let xb = x1.sqrt(); let x4 = big.blend(xb, xa); let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); let z = z.mul_add(x3 * x4, x4); let z1 = z + z; // acos let z3 = self.cmp_lt(f32x4::ZERO).blend(f32x4::PI - z1, z1); let z4 = f32x4::FRAC_PI_2 - z.flip_signs(self); let acos = big.blend(z3, z4); acos } #[inline] pub fn atan(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x4!(P3atanf, 8.05374449538E-2); const_f32_as_f32x4!(P2atanf, -1.38776856032E-1); const_f32_as_f32x4!(P1atanf, 1.99777106478E-1); const_f32_as_f32x4!(P0atanf, -3.33329491539E-1); let t = self.abs(); // small: z = t / 1.0; 
// medium: z = (t-1.0) / (t+1.0); // big: z = -1.0 / t; let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE); let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE); let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2); s = notsmal & s; let mut a = notbig & t; a = notsmal.blend(a - Self::ONE, a); let mut b = notbig & Self::ONE; b = notsmal.blend(b + t, b); let z = a / b; let zz = z * z; // Taylor expansion let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf); re = re.mul_add(zz * z, z) + s; // get sign bit re = (self.sign_bit()).blend(-re, re); re } #[inline] pub fn atan2(self, x: Self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x4!(P3atanf, 8.05374449538E-2); const_f32_as_f32x4!(P2atanf, -1.38776856032E-1); const_f32_as_f32x4!(P1atanf, 1.99777106478E-1); const_f32_as_f32x4!(P0atanf, -3.33329491539E-1); let y = self; // move in first octant let x1 = x.abs(); let y1 = y.abs(); let swapxy = y1.cmp_gt(x1); // swap x and y if y1 > x1 let mut x2 = swapxy.blend(y1, x1); let mut y2 = swapxy.blend(x1, y1); // check for special case: x and y are both +/- INF let both_infinite = x.is_inf() & y.is_inf(); if both_infinite.any() { let minus_one = -Self::ONE; x2 = both_infinite.blend(x2 & minus_one, x2); y2 = both_infinite.blend(y2 & minus_one, y2); } // x = y = 0 will produce NAN. No problem, fixed below let t = y2 / x2; // small: z = t / 1.0; // medium: z = (t-1.0) / (t+1.0); let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE); let a = notsmal.blend(t - Self::ONE, t); let b = notsmal.blend(t + Self::ONE, Self::ONE); let s = notsmal & Self::FRAC_PI_4; let z = a / b; let zz = z * z; // Taylor expansion let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf); re = re.mul_add(zz * z, z) + s; // move back in place re = swapxy.blend(Self::FRAC_PI_2 - re, re); re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re); re = (x.sign_bit()).blend(Self::PI - re, re); // get sign bit re = (y.sign_bit()).blend(-re, re); re } #[inline] #[must_use] pub fn sin_cos(self) -> (Self, Self) { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x4!(DP1F, 0.78515625_f32 * 2.0); const_f32_as_f32x4!(DP2F, 2.4187564849853515625E-4_f32 * 2.0); const_f32_as_f32x4!(DP3F, 3.77489497744594108E-8_f32 * 2.0); const_f32_as_f32x4!(P0sinf, -1.6666654611E-1); const_f32_as_f32x4!(P1sinf, 8.3321608736E-3); const_f32_as_f32x4!(P2sinf, -1.9515295891E-4); const_f32_as_f32x4!(P0cosf, 4.166664568298827E-2); const_f32_as_f32x4!(P1cosf, -1.388731625493765E-3); const_f32_as_f32x4!(P2cosf, 2.443315711809948E-5); const_f32_as_f32x4!(TWO_OVER_PI, 2.0 / core::f32::consts::PI); let xa = self.abs(); // Find quadrant let y = (xa * TWO_OVER_PI).round(); let q: i32x4 = y.round_int(); let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa))); let x2 = x * x; let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x; let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2) + f32x4::from(0.5).mul_neg_add(x2, f32x4::from(1.0)); let swap = !(q & i32x4::from(1)).cmp_eq(i32x4::from(0)); let mut overflow: f32x4 = cast(q.cmp_gt(i32x4::from(0x2000000))); overflow &= xa.is_finite(); s = overflow.blend(f32x4::from(0.0), s); c = overflow.blend(f32x4::from(1.0), c); // calc sin let mut sin1 = cast::<_, f32x4>(swap).blend(c, s); let sign_sin: i32x4 = (q << 30) ^ cast::<_, i32x4>(self); sin1 = sin1.flip_signs(cast(sign_sin)); // 
calc cos let mut cos1 = cast::<_, f32x4>(swap).blend(s, c); let sign_cos: i32x4 = ((q + i32x4::from(1)) & i32x4::from(2)) << 30; cos1 ^= cast::<_, f32x4>(sign_cos); (sin1, cos1) } #[inline] #[must_use] pub fn sin(self) -> Self { let (s, _) = self.sin_cos(); s } #[inline] #[must_use] pub fn cos(self) -> Self { let (_, c) = self.sin_cos(); c } #[inline] #[must_use] pub fn tan(self) -> Self { let (s, c) = self.sin_cos(); s / c } #[inline] #[must_use] pub fn to_degrees(self) -> Self { const_f32_as_f32x4!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI); self * RAD_TO_DEG_RATIO } #[inline] #[must_use] pub fn to_radians(self) -> Self { const_f32_as_f32x4!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32); self * DEG_TO_RAD_RATIO } #[inline] #[must_use] pub fn recip(self) -> Self { pick! { if #[cfg(target_feature="sse")] { Self { sse: reciprocal_m128(self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_div(f32x4_splat(1.0), self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), self.neon) }} } else { Self { arr: [ 1.0 / self.arr[0], 1.0 / self.arr[1], 1.0 / self.arr[2], 1.0 / self.arr[3], ]} } } } #[inline] #[must_use] pub fn recip_sqrt(self) -> Self { pick! { if #[cfg(target_feature="sse")] { Self { sse: reciprocal_sqrt_m128(self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_div(f32x4_splat(1.0), f32x4_sqrt(self.simd)) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vdivq_f32(vdupq_n_f32(1.0), vsqrtq_f32(self.neon)) }} } else if #[cfg(feature="std")] { Self { arr: [ 1.0 / self.arr[0].sqrt(), 1.0 / self.arr[1].sqrt(), 1.0 / self.arr[2].sqrt(), 1.0 / self.arr[3].sqrt(), ]} } else { Self { arr: [ 1.0 / software_sqrt(self.arr[0] as f64) as f32, 1.0 / software_sqrt(self.arr[1] as f64) as f32, 1.0 / software_sqrt(self.arr[2] as f64) as f32, 1.0 / software_sqrt(self.arr[3] as f64) as f32, ]} } } } #[inline] #[must_use] pub fn sqrt(self) -> Self { pick! { if #[cfg(target_feature="sse")] { Self { sse: sqrt_m128(self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_sqrt(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vsqrtq_f32(self.neon) }} } else if #[cfg(feature="std")] { Self { arr: [ self.arr[0].sqrt(), self.arr[1].sqrt(), self.arr[2].sqrt(), self.arr[3].sqrt(), ]} } else { Self { arr: [ software_sqrt(self.arr[0] as f64) as f32, software_sqrt(self.arr[1] as f64) as f32, software_sqrt(self.arr[2] as f64) as f32, software_sqrt(self.arr[3] as f64) as f32, ]} } } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="sse")] { move_mask_m128(self.sse) } else if #[cfg(target_feature="simd128")] { u32x4_bitmask(self.simd) as i32 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { // set all to 1 if top bit is set, else 0 let masked = vcltq_s32( vreinterpretq_s32_f32(self.neon), vdupq_n_s32(0)); // select the right bit out of each lane let selectbit : uint32x4_t = core::intrinsics::transmute([1u32, 2, 4, 8]); let r = vandq_u32(masked, selectbit); // horizontally add the 16-bit lanes vaddvq_u32(r) as i32 } } else { (((self.arr[0].to_bits() as i32) < 0) as i32) << 0 | (((self.arr[1].to_bits() as i32) < 0) as i32) << 1 | (((self.arr[2].to_bits() as i32) < 0) as i32) << 2 | (((self.arr[3].to_bits() as i32) < 0) as i32) << 3 } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! 
{ if #[cfg(target_feature="simd128")] { v128_any_true(self.simd) } else { self.move_mask() != 0 } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="simd128")] { u32x4_all_true(self.simd) } else { // four lanes self.move_mask() == 0b1111 } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] fn vm_pow2n(self) -> Self { const_f32_as_f32x4!(pow2_23, 8388608.0); const_f32_as_f32x4!(bias, 127.0); let a = self + (bias + pow2_23); let c = cast::<_, i32x4>(a) << 23; cast::<_, f32x4>(c) } /// Calculate the exponent of a packed `f32x4` #[inline] #[must_use] pub fn exp(self) -> Self { const_f32_as_f32x4!(P0, 1.0 / 2.0); const_f32_as_f32x4!(P1, 1.0 / 6.0); const_f32_as_f32x4!(P2, 1. / 24.); const_f32_as_f32x4!(P3, 1. / 120.); const_f32_as_f32x4!(P4, 1. / 720.); const_f32_as_f32x4!(P5, 1. / 5040.); const_f32_as_f32x4!(LN2D_HI, 0.693359375); const_f32_as_f32x4!(LN2D_LO, -2.12194440e-4); let max_x = f32x4::from(87.3); let r = (self * Self::LOG2_E).round(); let x = r.mul_neg_add(LN2D_HI, self); let x = r.mul_neg_add(LN2D_LO, x); let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5); let x2 = x * x; let z = z.mul_add(x2, x); let n2 = Self::vm_pow2n(r); let z = (z + Self::ONE) * n2; // check for overflow let in_range = self.abs().cmp_lt(max_x); let in_range = in_range & self.is_finite(); in_range.blend(z, Self::ZERO) } #[inline] fn exponent(self) -> f32x4 { const_f32_as_f32x4!(pow2_23, 8388608.0); const_f32_as_f32x4!(bias, 127.0); let a = cast::<_, u32x4>(self); let b = a >> 23; let c = b | cast::<_, u32x4>(pow2_23); let d = cast::<_, f32x4>(c); let e = d - (pow2_23 + bias); e } #[inline] fn fraction_2(self) -> Self { let t1 = cast::<_, u32x4>(self); let t2 = cast::<_, u32x4>( (t1 & u32x4::from(0x007FFFFF)) | u32x4::from(0x3F000000), ); cast::<_, f32x4>(t2) } #[inline] fn is_zero_or_subnormal(self) -> Self { let t = cast::<_, i32x4>(self); let t = t & i32x4::splat(0x7F800000); i32x4::round_float(t.cmp_eq(i32x4::splat(0))) } #[inline] fn infinity() -> Self { cast::<_, f32x4>(i32x4::splat(0x7F800000)) } #[inline] fn nan_log() -> Self { cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF)) } #[inline] fn nan_pow() -> Self { cast::<_, f32x4>(i32x4::splat(0x7FC00000 | 0x101 & 0x003FFFFF)) } #[inline] pub fn sign_bit(self) -> Self { let t1 = cast::<_, i32x4>(self); let t2 = t1 >> 31; !cast::<_, f32x4>(t2).cmp_eq(f32x4::ZERO) } /// horizontal add of all the elements of the vector #[inline] #[must_use] pub fn reduce_add(self) -> f32 { let arr: [f32; 4] = cast(self); arr.iter().sum() } /// Natural log (ln(x)) #[inline] #[must_use] pub fn ln(self) -> Self { const_f32_as_f32x4!(HALF, 0.5); const_f32_as_f32x4!(P0, 3.3333331174E-1); const_f32_as_f32x4!(P1, -2.4999993993E-1); const_f32_as_f32x4!(P2, 2.0000714765E-1); const_f32_as_f32x4!(P3, -1.6668057665E-1); const_f32_as_f32x4!(P4, 1.4249322787E-1); const_f32_as_f32x4!(P5, -1.2420140846E-1); const_f32_as_f32x4!(P6, 1.1676998740E-1); const_f32_as_f32x4!(P7, -1.1514610310E-1); const_f32_as_f32x4!(P8, 7.0376836292E-2); const_f32_as_f32x4!(LN2F_HI, 0.693359375); const_f32_as_f32x4!(LN2F_LO, -2.12194440e-4); const_f32_as_f32x4!(VM_SMALLEST_NORMAL, 1.17549435E-38); let x1 = self; let x = Self::fraction_2(x1); let e = Self::exponent(x1); let mask = x.cmp_gt(Self::SQRT_2 * HALF); let x = (!mask).blend(x + x, x); let fe = mask.blend(e + Self::ONE, e); let x = x - Self::ONE; let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8); let x2 = x * x; let res = x2 * x * res; let res = fe.mul_add(LN2F_LO, res); 
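    // ln(2) is split as LN2F_HI + LN2F_LO: the tiny LO part was folded in above
    // and the big HI part is added last, keeping extra precision in
    // `exponent * ln(2) + ln(mantissa)`.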
let res = res + x2.mul_neg_add(HALF, x); let res = fe.mul_add(LN2F_HI, res); let overflow = !self.is_finite(); let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL); let mask = overflow | underflow; if !mask.any() { res } else { let is_zero = self.is_zero_or_subnormal(); let res = underflow.blend(Self::nan_log(), res); let res = is_zero.blend(Self::infinity(), res); let res = overflow.blend(self, res); res } } #[inline] #[must_use] pub fn log2(self) -> Self { Self::ln(self) * Self::LOG2_E } #[inline] #[must_use] pub fn log10(self) -> Self { Self::ln(self) * Self::LOG10_E } #[inline] #[must_use] pub fn pow_f32x4(self, y: f32x4) -> Self { const_f32_as_f32x4!(ln2f_hi, 0.693359375); const_f32_as_f32x4!(ln2f_lo, -2.12194440e-4); const_f32_as_f32x4!(P0logf, 3.3333331174E-1); const_f32_as_f32x4!(P1logf, -2.4999993993E-1); const_f32_as_f32x4!(P2logf, 2.0000714765E-1); const_f32_as_f32x4!(P3logf, -1.6668057665E-1); const_f32_as_f32x4!(P4logf, 1.4249322787E-1); const_f32_as_f32x4!(P5logf, -1.2420140846E-1); const_f32_as_f32x4!(P6logf, 1.1676998740E-1); const_f32_as_f32x4!(P7logf, -1.1514610310E-1); const_f32_as_f32x4!(P8logf, 7.0376836292E-2); const_f32_as_f32x4!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp const_f32_as_f32x4!(p3expf, 1.0 / 6.0); const_f32_as_f32x4!(p4expf, 1.0 / 24.0); const_f32_as_f32x4!(p5expf, 1.0 / 120.0); const_f32_as_f32x4!(p6expf, 1.0 / 720.0); const_f32_as_f32x4!(p7expf, 1.0 / 5040.0); let x1 = self.abs(); let x = x1.fraction_2(); let mask = x.cmp_gt(f32x4::SQRT_2 * f32x4::HALF); let x = (!mask).blend(x + x, x); let x = x - f32x4::ONE; let x2 = x * x; let lg1 = polynomial_8!( x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf ); let lg1 = lg1 * x2 * x; let ef = x1.exponent(); let ef = mask.blend(ef + f32x4::ONE, ef); let e1 = (ef * y).round(); let yr = ef.mul_sub(y, e1); let lg = f32x4::HALF.mul_neg_add(x2, x) + lg1; let x2_err = (f32x4::HALF * x).mul_sub(x, f32x4::HALF * x2); let lg_err = f32x4::HALF.mul_add(x2, lg - x) - lg1; let e2 = (lg * y * f32x4::LOG2_E).round(); let v = lg.mul_sub(y, e2 * ln2f_hi); let v = e2.mul_neg_add(ln2f_lo, v); let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x4::LN_2); let x = v; let e3 = (x * f32x4::LOG2_E).round(); let x = e3.mul_neg_add(f32x4::LN_2, x); let x2 = x * x; let z = x2.mul_add( polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf), x + f32x4::ONE, ); let ee = e1 + e2 + e3; let ei = cast::<_, i32x4>(ee.round_int()); let ej = cast::<_, i32x4>(ei + (cast::<_, i32x4>(z) >> 23)); let overflow = cast::<_, f32x4>(ej.cmp_gt(i32x4::splat(0x0FF))) | (ee.cmp_gt(f32x4::splat(300.0))); let underflow = cast::<_, f32x4>(ej.cmp_lt(i32x4::splat(0x000))) | (ee.cmp_lt(f32x4::splat(-300.0))); // Add exponent by integer addition let z = cast::<_, f32x4>(cast::<_, i32x4>(z) + (ei << 23)); // Check for overflow/underflow let z = if (overflow | underflow).any() { let z = underflow.blend(f32x4::ZERO, z); overflow.blend(Self::infinity(), z) } else { z }; // Check for self == 0 let x_zero = self.is_zero_or_subnormal(); let z = x_zero.blend( y.cmp_lt(f32x4::ZERO).blend( Self::infinity(), y.cmp_eq(f32x4::ZERO).blend(f32x4::ONE, f32x4::ZERO), ), z, ); let x_sign = self.sign_bit(); let z = if x_sign.any() { // Y into an integer let yi = y.cmp_eq(y.round()); // Is y odd? 
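      // For a negative base the result is only real when `y` is an integer:
      // `y_odd` below moves the low bit of `round_int(y)` into the sign position,
      // so an odd integer `y` gets a negative result, and a non-integer `y` yields
      // the NaN from `nan_pow()` (unless the base is zero).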
let y_odd = cast::<_, i32x4>(y.round_int() << 31).round_float(); let z1 = yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow())); x_sign.blend(z1, z) } else { z }; let x_finite = self.is_finite(); let y_finite = y.is_finite(); let e_finite = ee.is_finite(); if (x_finite & y_finite & (e_finite | x_zero)).all() { return z; } (self.is_nan() | y.is_nan()).blend(self + y, z) } #[inline] pub fn powf(self, y: f32) -> Self { Self::pow_f32x4(self, f32x4::splat(y)) } #[inline] pub fn to_array(self) -> [f32; 4] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[f32; 4] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [f32; 4] { cast_mut(self) } #[inline] pub fn from_i32x4(v: i32x4) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: convert_to_m128_from_i32_m128i(v.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f32x4_convert_i32x4(v.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { Self { neon: unsafe { vcvtq_f32_s32(v.neon) }} } else { Self { arr: [ v.as_array_ref()[0] as f32, v.as_array_ref()[1] as f32, v.as_array_ref()[2] as f32, v.as_array_ref()[3] as f32, ] } } } } } wide-0.7.32/src/f32x8_.rs000066400000000000000000001161331473735473700147320ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="avx")] { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] pub struct f32x8 { avx: m256 } } else { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] pub struct f32x8 { a : f32x4, b : f32x4 } } } macro_rules! const_f32_as_f32x8 { ($i:ident, $f:expr) => { #[allow(non_upper_case_globals)] pub const $i: f32x8 = f32x8::new([$f; 8]); }; } impl f32x8 { const_f32_as_f32x8!(ONE, 1.0); const_f32_as_f32x8!(HALF, 0.5); const_f32_as_f32x8!(ZERO, 0.0); const_f32_as_f32x8!(E, core::f32::consts::E); const_f32_as_f32x8!(FRAC_1_PI, core::f32::consts::FRAC_1_PI); const_f32_as_f32x8!(FRAC_2_PI, core::f32::consts::FRAC_2_PI); const_f32_as_f32x8!(FRAC_2_SQRT_PI, core::f32::consts::FRAC_2_SQRT_PI); const_f32_as_f32x8!(FRAC_1_SQRT_2, core::f32::consts::FRAC_1_SQRT_2); const_f32_as_f32x8!(FRAC_PI_2, core::f32::consts::FRAC_PI_2); const_f32_as_f32x8!(FRAC_PI_3, core::f32::consts::FRAC_PI_3); const_f32_as_f32x8!(FRAC_PI_4, core::f32::consts::FRAC_PI_4); const_f32_as_f32x8!(FRAC_PI_6, core::f32::consts::FRAC_PI_6); const_f32_as_f32x8!(FRAC_PI_8, core::f32::consts::FRAC_PI_8); const_f32_as_f32x8!(LN_2, core::f32::consts::LN_2); const_f32_as_f32x8!(LN_10, core::f32::consts::LN_10); const_f32_as_f32x8!(LOG2_E, core::f32::consts::LOG2_E); const_f32_as_f32x8!(LOG10_E, core::f32::consts::LOG10_E); const_f32_as_f32x8!(LOG10_2, core::f32::consts::LOG10_2); const_f32_as_f32x8!(LOG2_10, core::f32::consts::LOG2_10); const_f32_as_f32x8!(PI, core::f32::consts::PI); const_f32_as_f32x8!(SQRT_2, core::f32::consts::SQRT_2); const_f32_as_f32x8!(TAU, core::f32::consts::TAU); } unsafe impl Zeroable for f32x8 {} unsafe impl Pod for f32x8 {} impl Add for f32x8 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: add_m256(self.avx, rhs.avx) } } else { Self { a : self.a.add(rhs.a), b : self.b.add(rhs.b), } } } } } impl Sub for f32x8 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx")] { Self { avx: sub_m256(self.avx, rhs.avx) } } else { Self { a : self.a.sub(rhs.a), b : self.b.sub(rhs.b), } } } } } impl Mul for f32x8 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: mul_m256(self.avx, rhs.avx) } } else { Self { a : self.a.mul(rhs.a), b : self.b.mul(rhs.b), } } } } } impl Div for f32x8 { type Output = Self; #[inline] #[must_use] fn div(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: div_m256(self.avx, rhs.avx) } } else { Self { a : self.a.div(rhs.a), b : self.b.div(rhs.b), } } } } } impl Add for f32x8 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: f32) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for f32x8 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: f32) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for f32x8 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: f32) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Div for f32x8 { type Output = Self; #[inline] #[must_use] fn div(self, rhs: f32) -> Self::Output { self.div(Self::splat(rhs)) } } impl Add for f32 { type Output = f32x8; #[inline] #[must_use] fn add(self, rhs: f32x8) -> Self::Output { f32x8::splat(self).add(rhs) } } impl Sub for f32 { type Output = f32x8; #[inline] #[must_use] fn sub(self, rhs: f32x8) -> Self::Output { f32x8::splat(self).sub(rhs) } } impl Mul for f32 { type Output = f32x8; #[inline] #[must_use] fn mul(self, rhs: f32x8) -> Self::Output { f32x8::splat(self).mul(rhs) } } impl Div for f32 { type Output = f32x8; #[inline] #[must_use] fn div(self, rhs: f32x8) -> Self::Output { f32x8::splat(self).div(rhs) } } impl BitAnd for f32x8 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: bitand_m256(self.avx, rhs.avx) } } else { Self { a : self.a.bitand(rhs.a), b : self.b.bitand(rhs.b), } } } } } impl BitOr for f32x8 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: bitor_m256(self.avx, rhs.avx) } } else { Self { a : self.a.bitor(rhs.a), b : self.b.bitor(rhs.b), } } } } } impl BitXor for f32x8 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: bitxor_m256(self.avx, rhs.avx) } } else { Self { a : self.a.bitxor(rhs.a), b : self.b.bitxor(rhs.b), } } } } } impl CmpEq for f32x8 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: cmp_op_mask_m256::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_eq(rhs.a), b : self.b.cmp_eq(rhs.b), } } } } } impl CmpGe for f32x8 { type Output = Self; #[inline] #[must_use] fn cmp_ge(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_ge(rhs.a), b : self.b.cmp_ge(rhs.b), } } } } } impl CmpGt for f32x8 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx")] { Self { avx: cmp_op_mask_m256::<{cmp_op!(GreaterThanOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_gt(rhs.a), b : self.b.cmp_gt(rhs.b), } } } } } impl CmpNe for f32x8 { type Output = Self; #[inline] #[must_use] fn cmp_ne(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: cmp_op_mask_m256::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_ne(rhs.a), b : self.b.cmp_ne(rhs.b), } } } } } impl CmpLe for f32x8 { type Output = Self; #[inline] #[must_use] fn cmp_le(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: cmp_op_mask_m256::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_le(rhs.a), b : self.b.cmp_le(rhs.b), } } } } } impl CmpLt for f32x8 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: cmp_op_mask_m256::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_lt(rhs.a), b : self.b.cmp_lt(rhs.b), } } } } } impl f32x8 { #[inline] #[must_use] pub const fn new(array: [f32; 8]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: blend_varying_m256(f.avx, t.avx, self.avx) } } else { Self { a : self.a.blend(t.a, f.a), b : self.b.blend(t.b, f.b), } } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="avx")] { let non_sign_bits = f32x8::from(f32::from_bits(i32::MAX as u32)); self & non_sign_bits } else { Self { a : self.a.abs(), b : self.b.abs(), } } } } #[inline] #[must_use] pub fn floor(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: floor_m256(self.avx) } } else { Self { a : self.a.floor(), b : self.b.floor(), } } } } #[inline] #[must_use] pub fn ceil(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: ceil_m256(self.avx) } } else { Self { a : self.a.ceil(), b : self.b.ceil(), } } } } /// Calculates the lanewise maximum of both vectors. This is a faster /// implementation than `max`, but it doesn't specify any behavior if NaNs are /// involved. #[inline] #[must_use] pub fn fast_max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: max_m256(self.avx, rhs.avx) } } else { Self { a : self.a.fast_max(rhs.a), b : self.b.fast_max(rhs.b), } } } } /// Calculates the lanewise maximum of both vectors. This doesn't match /// IEEE-754 and instead is defined as `self < rhs ? rhs : self`. #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { // max_m256 seems to do rhs < self ? self : rhs. So if there's any NaN // involved, it chooses rhs, so we need to specifically check rhs for // NaN. rhs.is_nan().blend(self, Self { avx: max_m256(self.avx, rhs.avx) }) } else { Self { a : self.a.max(rhs.a), b : self.b.max(rhs.b), } } } } /// Calculates the lanewise minimum of both vectors. This is a faster /// implementation than `min`, but it doesn't specify any behavior if NaNs are /// involved. #[inline] #[must_use] pub fn fast_min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: min_m256(self.avx, rhs.avx) } } else { Self { a : self.a.fast_min(rhs.a), b : self.b.fast_min(rhs.b), } } } } /// Calculates the lanewise minimum of both vectors. If either lane is NaN, /// the other lane gets chosen. 
Use `fast_min` for a faster implementation /// that doesn't handle NaNs. #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { // min_m256 seems to do rhs > self ? self : rhs. So if there's any NaN // involved, it chooses rhs, so we need to specifically check rhs for // NaN. rhs.is_nan().blend(self, Self { avx: min_m256(self.avx, rhs.avx) }) } else { Self { a : self.a.min(rhs.a), b : self.b.min(rhs.b), } } } } #[inline] #[must_use] pub fn is_nan(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: cmp_op_mask_m256::<{cmp_op!(Unordered)}>(self.avx, self.avx) } } else { Self { a : self.a.is_nan(), b : self.b.is_nan(), } } } } #[inline] #[must_use] pub fn is_finite(self) -> Self { let shifted_exp_mask = u32x8::from(0xFF000000); let u: u32x8 = cast(self); let shift_u = u << 1_u64; let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask); cast(out) } #[inline] #[must_use] pub fn is_inf(self) -> Self { let shifted_inf = u32x8::from(0xFF000000); let u: u32x8 = cast(self); let shift_u = u << 1_u64; let out = (shift_u).cmp_eq(shifted_inf); cast(out) } #[inline] #[must_use] pub fn round(self) -> Self { pick! { // NOTE: Is there an SSE2 version of this? f32x4 version probably translates but I've not had time to figure it out if #[cfg(target_feature="avx")] { Self { avx: round_m256::<{round_op!(Nearest)}>(self.avx) } } else { Self { a : self.a.round(), b : self.b.round(), } } } } /// Rounds each lane into an integer. This is a faster implementation than /// `round_int`, but it doesn't handle out of range values or NaNs. For those /// values you get implementation defined behavior. #[inline] #[must_use] pub fn fast_round_int(self) -> i32x8 { pick! { if #[cfg(target_feature="avx")] { cast(convert_to_i32_m256i_from_m256(self.avx)) } else { cast([ self.a.fast_round_int(), self.b.fast_round_int()]) } } } /// Rounds each lane into an integer. This saturates out of range values and /// turns NaNs into 0. Use `fast_round_int` for a faster implementation that /// doesn't handle out of range values or NaNs. #[inline] #[must_use] pub fn round_int(self) -> i32x8 { pick! { if #[cfg(target_feature="avx")] { // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 let non_nan_mask = self.cmp_eq(self); let non_nan = self & non_nan_mask; let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); let cast: i32x8 = cast(convert_to_i32_m256i_from_m256(non_nan.avx)); flip_to_max ^ cast } else { cast([ self.a.round_int(), self.b.round_int(), ]) } } } /// Truncates each lane into an integer. This is a faster implementation than /// `trunc_int`, but it doesn't handle out of range values or NaNs. For those /// values you get implementation defined behavior. #[inline] #[must_use] pub fn fast_trunc_int(self) -> i32x8 { pick! { if #[cfg(all(target_feature="avx"))] { cast(convert_truncate_to_i32_m256i_from_m256(self.avx)) } else { cast([ self.a.fast_trunc_int(), self.b.fast_trunc_int(), ]) } } } /// Truncates each lane into an integer. This saturates out of range values /// and turns NaNs into 0. Use `fast_trunc_int` for a faster implementation /// that doesn't handle out of range values or NaNs. #[inline] #[must_use] pub fn trunc_int(self) -> i32x8 { pick! 
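// NOTE (illustrative sketch, not part of the original source): per the doc comments above,
// the non-`fast_` conversions saturate out-of-range lanes and map NaN to 0, while the
// `fast_` variants skip those checks for speed, e.g.
//
//     use wide::f32x8;
//     let v = f32x8::new([1.2, -1.2, 3.7, -3.7, f32::NAN, 3.0e9, -3.0e9, 0.0]);
//     assert_eq!(v.round_int().as_array_ref(), &[1, -1, 4, -4, 0, i32::MAX, i32::MIN, 0]);
//     assert_eq!(v.trunc_int().as_array_ref(), &[1, -1, 3, -3, 0, i32::MAX, i32::MIN, 0]);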
{ if #[cfg(target_feature="avx")] { // Based on: https://github.com/v8/v8/blob/210987a552a2bf2a854b0baa9588a5959ff3979d/src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h#L489-L504 let non_nan_mask = self.cmp_eq(self); let non_nan = self & non_nan_mask; let flip_to_max: i32x8 = cast(self.cmp_ge(Self::splat(2147483648.0))); let cast: i32x8 = cast(convert_truncate_to_i32_m256i_from_m256(non_nan.avx)); flip_to_max ^ cast } else { cast([ self.a.trunc_int(), self.b.trunc_int(), ]) } } } #[inline] #[must_use] pub fn mul_add(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="avx",target_feature="fma"))] { Self { avx: fused_mul_add_m256(self.avx, m.avx, a.avx) } } else if #[cfg(target_feature="avx")] { // still want to use 256 bit ops (self * m) + a } else { Self { a : self.a.mul_add(m.a, a.a), b : self.b.mul_add(m.b, a.b), } } } } #[inline] #[must_use] pub fn mul_sub(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="avx",target_feature="fma"))] { Self { avx: fused_mul_sub_m256(self.avx, m.avx, a.avx) } } else if #[cfg(target_feature="avx")] { // still want to use 256 bit ops (self * m) - a } else { Self { a : self.a.mul_sub(m.a, a.a), b : self.b.mul_sub(m.b, a.b), } } } } #[inline] #[must_use] pub fn mul_neg_add(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="avx",target_feature="fma"))] { Self { avx: fused_mul_neg_add_m256(self.avx, m.avx, a.avx) } } else if #[cfg(target_feature="avx")] { // still want to use 256 bit ops a - (self * m) } else { Self { a : self.a.mul_neg_add(m.a, a.a), b : self.b.mul_neg_add(m.b, a.b), } } } } #[inline] #[must_use] pub fn mul_neg_sub(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="avx",target_feature="fma"))] { Self { avx: fused_mul_neg_sub_m256(self.avx, m.avx, a.avx) } } else if #[cfg(target_feature="avx")] { // still want to use 256 bit ops -(self * m) - a } else { Self { a : self.a.mul_neg_sub(m.a, a.a), b : self.b.mul_neg_sub(m.b, a.b), } } } } #[inline] #[must_use] pub fn flip_signs(self, signs: Self) -> Self { self ^ (signs & Self::from(-0.0)) } #[inline] #[must_use] pub fn copysign(self, sign: Self) -> Self { let magnitude_mask = Self::from(f32::from_bits(u32::MAX >> 1)); (self & magnitude_mask) | (sign & Self::from(-0.0)) } #[inline] pub fn asin_acos(self) -> (Self, Self) { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x8!(P4asinf, 4.2163199048E-2); const_f32_as_f32x8!(P3asinf, 2.4181311049E-2); const_f32_as_f32x8!(P2asinf, 4.5470025998E-2); const_f32_as_f32x8!(P1asinf, 7.4953002686E-2); const_f32_as_f32x8!(P0asinf, 1.6666752422E-1); let xa = self.abs(); let big = xa.cmp_ge(f32x8::splat(0.5)); let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa); let x2 = xa * xa; let x3 = big.blend(x1, x2); let xb = x1.sqrt(); let x4 = big.blend(xb, xa); let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); let z = z.mul_add(x3 * x4, x4); let z1 = z + z; // acos let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1); let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self); let acos = big.blend(z3, z4); // asin let z3 = f32x8::FRAC_PI_2 - z1; let asin = big.blend(z3, z); let asin = asin.flip_signs(self); (asin, acos) } #[inline] #[must_use] pub fn asin(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x8!(P4asinf, 4.2163199048E-2); const_f32_as_f32x8!(P3asinf, 
2.4181311049E-2); const_f32_as_f32x8!(P2asinf, 4.5470025998E-2); const_f32_as_f32x8!(P1asinf, 7.4953002686E-2); const_f32_as_f32x8!(P0asinf, 1.6666752422E-1); let xa = self.abs(); let big = xa.cmp_ge(f32x8::splat(0.5)); let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa); let x2 = xa * xa; let x3 = big.blend(x1, x2); let xb = x1.sqrt(); let x4 = big.blend(xb, xa); let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); let z = z.mul_add(x3 * x4, x4); let z1 = z + z; // asin let z3 = f32x8::FRAC_PI_2 - z1; let asin = big.blend(z3, z); let asin = asin.flip_signs(self); asin } #[inline] #[must_use] pub fn acos(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x8!(P4asinf, 4.2163199048E-2); const_f32_as_f32x8!(P3asinf, 2.4181311049E-2); const_f32_as_f32x8!(P2asinf, 4.5470025998E-2); const_f32_as_f32x8!(P1asinf, 7.4953002686E-2); const_f32_as_f32x8!(P0asinf, 1.6666752422E-1); let xa = self.abs(); let big = xa.cmp_ge(f32x8::splat(0.5)); let x1 = f32x8::splat(0.5) * (f32x8::ONE - xa); let x2 = xa * xa; let x3 = big.blend(x1, x2); let xb = x1.sqrt(); let x4 = big.blend(xb, xa); let z = polynomial_4!(x3, P0asinf, P1asinf, P2asinf, P3asinf, P4asinf); let z = z.mul_add(x3 * x4, x4); let z1 = z + z; // acos let z3 = self.cmp_lt(f32x8::ZERO).blend(f32x8::PI - z1, z1); let z4 = f32x8::FRAC_PI_2 - z.flip_signs(self); let acos = big.blend(z3, z4); acos } #[inline] pub fn atan(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x8!(P3atanf, 8.05374449538E-2); const_f32_as_f32x8!(P2atanf, -1.38776856032E-1); const_f32_as_f32x8!(P1atanf, 1.99777106478E-1); const_f32_as_f32x8!(P0atanf, -3.33329491539E-1); let t = self.abs(); // small: z = t / 1.0; // medium: z = (t-1.0) / (t+1.0); // big: z = -1.0 / t; let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE); let notbig = t.cmp_le(Self::SQRT_2 + Self::ONE); let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2); s = notsmal & s; let mut a = notbig & t; a = notsmal.blend(a - Self::ONE, a); let mut b = notbig & Self::ONE; b = notsmal.blend(b + t, b); let z = a / b; let zz = z * z; // Taylor expansion let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf); re = re.mul_add(zz * z, z) + s; // get sign bit re = (self.sign_bit()).blend(-re, re); re } #[inline] pub fn atan2(self, x: Self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x8!(P3atanf, 8.05374449538E-2); const_f32_as_f32x8!(P2atanf, -1.38776856032E-1); const_f32_as_f32x8!(P1atanf, 1.99777106478E-1); const_f32_as_f32x8!(P0atanf, -3.33329491539E-1); let y = self; // move in first octant let x1 = x.abs(); let y1 = y.abs(); let swapxy = y1.cmp_gt(x1); // swap x and y if y1 > x1 let mut x2 = swapxy.blend(y1, x1); let mut y2 = swapxy.blend(x1, y1); // check for special case: x and y are both +/- INF let both_infinite = x.is_inf() & y.is_inf(); if both_infinite.any() { let minus_one = -Self::ONE; x2 = both_infinite.blend(x2 & minus_one, x2); y2 = both_infinite.blend(y2 & minus_one, y2); } // x = y = 0 will produce NAN. 
No problem, fixed below let t = y2 / x2; // small: z = t / 1.0; // medium: z = (t-1.0) / (t+1.0); let notsmal = t.cmp_ge(Self::SQRT_2 - Self::ONE); let a = notsmal.blend(t - Self::ONE, t); let b = notsmal.blend(t + Self::ONE, Self::ONE); let s = notsmal & Self::FRAC_PI_4; let z = a / b; let zz = z * z; // Taylor expansion let mut re = polynomial_3!(zz, P0atanf, P1atanf, P2atanf, P3atanf); re = re.mul_add(zz * z, z) + s; // move back in place re = swapxy.blend(Self::FRAC_PI_2 - re, re); re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re); re = (x.sign_bit()).blend(Self::PI - re, re); // get sign bit re = (y.sign_bit()).blend(-re, re); re } #[inline] #[must_use] pub fn sin_cos(self) -> (Self, Self) { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f32_as_f32x8!(DP1F, 0.78515625_f32 * 2.0); const_f32_as_f32x8!(DP2F, 2.4187564849853515625E-4_f32 * 2.0); const_f32_as_f32x8!(DP3F, 3.77489497744594108E-8_f32 * 2.0); const_f32_as_f32x8!(P0sinf, -1.6666654611E-1); const_f32_as_f32x8!(P1sinf, 8.3321608736E-3); const_f32_as_f32x8!(P2sinf, -1.9515295891E-4); const_f32_as_f32x8!(P0cosf, 4.166664568298827E-2); const_f32_as_f32x8!(P1cosf, -1.388731625493765E-3); const_f32_as_f32x8!(P2cosf, 2.443315711809948E-5); const_f32_as_f32x8!(TWO_OVER_PI, 2.0 / core::f32::consts::PI); let xa = self.abs(); // Find quadrant let y = (xa * TWO_OVER_PI).round(); let q: i32x8 = y.round_int(); let x = y.mul_neg_add(DP3F, y.mul_neg_add(DP2F, y.mul_neg_add(DP1F, xa))); let x2 = x * x; let mut s = polynomial_2!(x2, P0sinf, P1sinf, P2sinf) * (x * x2) + x; let mut c = polynomial_2!(x2, P0cosf, P1cosf, P2cosf) * (x2 * x2) + f32x8::from(0.5).mul_neg_add(x2, f32x8::from(1.0)); let swap = !(q & i32x8::from(1)).cmp_eq(i32x8::from(0)); let mut overflow: f32x8 = cast(q.cmp_gt(i32x8::from(0x2000000))); overflow &= xa.is_finite(); s = overflow.blend(f32x8::from(0.0), s); c = overflow.blend(f32x8::from(1.0), c); // calc sin let mut sin1 = cast::<_, f32x8>(swap).blend(c, s); let sign_sin: i32x8 = (q << 30) ^ cast::<_, i32x8>(self); sin1 = sin1.flip_signs(cast(sign_sin)); // calc cos let mut cos1 = cast::<_, f32x8>(swap).blend(s, c); let sign_cos: i32x8 = ((q + i32x8::from(1)) & i32x8::from(2)) << 30; cos1 ^= cast::<_, f32x8>(sign_cos); (sin1, cos1) } #[inline] #[must_use] pub fn sin(self) -> Self { let (s, _) = self.sin_cos(); s } #[inline] #[must_use] pub fn cos(self) -> Self { let (_, c) = self.sin_cos(); c } #[inline] #[must_use] pub fn tan(self) -> Self { let (s, c) = self.sin_cos(); s / c } #[inline] #[must_use] pub fn to_degrees(self) -> Self { const_f32_as_f32x8!(RAD_TO_DEG_RATIO, 180.0_f32 / core::f32::consts::PI); self * RAD_TO_DEG_RATIO } #[inline] #[must_use] pub fn to_radians(self) -> Self { const_f32_as_f32x8!(DEG_TO_RAD_RATIO, core::f32::consts::PI / 180.0_f32); self * DEG_TO_RAD_RATIO } #[inline] #[must_use] pub fn recip(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: reciprocal_m256(self.avx) } } else { Self { a : self.a.recip(), b : self.b.recip(), } } } } #[inline] #[must_use] pub fn recip_sqrt(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: reciprocal_sqrt_m256(self.avx) } } else { Self { a : self.a.recip_sqrt(), b : self.b.recip_sqrt(), } } } } #[inline] #[must_use] pub fn sqrt(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: sqrt_m256(self.avx) } } else { Self { a : self.a.sqrt(), b : self.b.sqrt(), } } } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! 
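// NOTE (illustrative sketch, not part of the original source): `sin_cos` produces both
// results from a single range reduction, so prefer it when a lane's sine and cosine are
// both needed; `sin`, `cos` and `tan` in this file are thin wrappers over it. The results
// are polynomial approximations, so compare with a tolerance, e.g.
//
//     use wide::f32x8;
//     let (s, c) = f32x8::splat(core::f32::consts::FRAC_PI_3).sin_cos();
//     assert!((s - f32x8::splat(0.866_025_4)).abs().to_array().iter().all(|e| *e < 1e-5));
//     assert!((c - f32x8::splat(0.5)).abs().to_array().iter().all(|e| *e < 1e-5));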
{ if #[cfg(target_feature="avx")] { move_mask_m256(self.avx) } else { (self.b.move_mask() << 4) | self.a.move_mask() } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="avx")] { move_mask_m256(self.avx) != 0 } else { self.a.any() || self.b.any() } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="avx")] { move_mask_m256(self.avx) == 0b11111111 } else { self.a.all() && self.b.all() } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] fn vm_pow2n(self) -> Self { const_f32_as_f32x8!(pow2_23, 8388608.0); const_f32_as_f32x8!(bias, 127.0); let a = self + (bias + pow2_23); let c = cast::<_, i32x8>(a) << 23; cast::<_, f32x8>(c) } /// Calculate the exponent of a packed `f32x8` #[inline] #[must_use] pub fn exp(self) -> Self { const_f32_as_f32x8!(P0, 1.0 / 2.0); const_f32_as_f32x8!(P1, 1.0 / 6.0); const_f32_as_f32x8!(P2, 1. / 24.); const_f32_as_f32x8!(P3, 1. / 120.); const_f32_as_f32x8!(P4, 1. / 720.); const_f32_as_f32x8!(P5, 1. / 5040.); const_f32_as_f32x8!(LN2D_HI, 0.693359375); const_f32_as_f32x8!(LN2D_LO, -2.12194440e-4); let max_x = f32x8::from(87.3); let r = (self * Self::LOG2_E).round(); let x = r.mul_neg_add(LN2D_HI, self); let x = r.mul_neg_add(LN2D_LO, x); let z = polynomial_5!(x, P0, P1, P2, P3, P4, P5); let x2 = x * x; let z = z.mul_add(x2, x); let n2 = Self::vm_pow2n(r); let z = (z + Self::ONE) * n2; // check for overflow let in_range = self.abs().cmp_lt(max_x); let in_range = in_range & self.is_finite(); in_range.blend(z, Self::ZERO) } #[inline] fn exponent(self) -> f32x8 { const_f32_as_f32x8!(pow2_23, 8388608.0); const_f32_as_f32x8!(bias, 127.0); let a = cast::<_, u32x8>(self); let b = a >> 23; let c = b | cast::<_, u32x8>(pow2_23); let d = cast::<_, f32x8>(c); let e = d - (pow2_23 + bias); e } #[inline] fn fraction_2(self) -> Self { let t1 = cast::<_, u32x8>(self); let t2 = cast::<_, u32x8>( (t1 & u32x8::from(0x007FFFFF)) | u32x8::from(0x3F000000), ); cast::<_, f32x8>(t2) } #[inline] fn is_zero_or_subnormal(self) -> Self { let t = cast::<_, i32x8>(self); let t = t & i32x8::splat(0x7F800000); i32x8::round_float(t.cmp_eq(i32x8::splat(0))) } #[inline] fn infinity() -> Self { cast::<_, f32x8>(i32x8::splat(0x7F800000)) } #[inline] fn nan_log() -> Self { cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF)) } #[inline] fn nan_pow() -> Self { cast::<_, f32x8>(i32x8::splat(0x7FC00000 | 0x101 & 0x003FFFFF)) } #[inline] pub fn sign_bit(self) -> Self { let t1 = cast::<_, i32x8>(self); let t2 = t1 >> 31; !cast::<_, f32x8>(t2).cmp_eq(f32x8::ZERO) } /// horizontal add of all the elements of the vector #[inline] #[must_use] pub fn reduce_add(self) -> f32 { pick! 
{ // From https://stackoverflow.com/questions/13219146/how-to-sum-m256-horizontally if #[cfg(target_feature="avx")]{ let hi_quad = extract_m128_from_m256::<1>(self.avx); let lo_quad = cast_to_m128_from_m256(self.avx); let sum_quad = add_m128(lo_quad,hi_quad); let lo_dual = sum_quad; let hi_dual = move_high_low_m128(sum_quad,sum_quad); let sum_dual = add_m128(lo_dual,hi_dual); let lo = sum_dual; let hi = shuffle_abi_f32_all_m128::<0b_01>(sum_dual, sum_dual); let sum = add_m128_s(lo, hi); get_f32_from_m128_s(sum) } else { self.a.reduce_add() + self.b.reduce_add() } } } /// Natural log (ln(x)) #[inline] #[must_use] pub fn ln(self) -> Self { const_f32_as_f32x8!(HALF, 0.5); const_f32_as_f32x8!(P0, 3.3333331174E-1); const_f32_as_f32x8!(P1, -2.4999993993E-1); const_f32_as_f32x8!(P2, 2.0000714765E-1); const_f32_as_f32x8!(P3, -1.6668057665E-1); const_f32_as_f32x8!(P4, 1.4249322787E-1); const_f32_as_f32x8!(P5, -1.2420140846E-1); const_f32_as_f32x8!(P6, 1.1676998740E-1); const_f32_as_f32x8!(P7, -1.1514610310E-1); const_f32_as_f32x8!(P8, 7.0376836292E-2); const_f32_as_f32x8!(LN2F_HI, 0.693359375); const_f32_as_f32x8!(LN2F_LO, -2.12194440e-4); const_f32_as_f32x8!(VM_SMALLEST_NORMAL, 1.17549435E-38); let x1 = self; let x = Self::fraction_2(x1); let e = Self::exponent(x1); let mask = x.cmp_gt(Self::SQRT_2 * HALF); let x = (!mask).blend(x + x, x); let fe = mask.blend(e + Self::ONE, e); let x = x - Self::ONE; let res = polynomial_8!(x, P0, P1, P2, P3, P4, P5, P6, P7, P8); let x2 = x * x; let res = x2 * x * res; let res = fe.mul_add(LN2F_LO, res); let res = res + x2.mul_neg_add(HALF, x); let res = fe.mul_add(LN2F_HI, res); let overflow = !self.is_finite(); let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL); let mask = overflow | underflow; if !mask.any() { res } else { let is_zero = self.is_zero_or_subnormal(); let res = underflow.blend(Self::nan_log(), res); let res = is_zero.blend(Self::infinity(), res); let res = overflow.blend(self, res); res } } #[inline] #[must_use] pub fn log2(self) -> Self { Self::ln(self) * Self::LOG2_E } #[inline] #[must_use] pub fn log10(self) -> Self { Self::ln(self) * Self::LOG10_E } #[inline] #[must_use] pub fn pow_f32x8(self, y: Self) -> Self { const_f32_as_f32x8!(ln2f_hi, 0.693359375); const_f32_as_f32x8!(ln2f_lo, -2.12194440e-4); const_f32_as_f32x8!(P0logf, 3.3333331174E-1); const_f32_as_f32x8!(P1logf, -2.4999993993E-1); const_f32_as_f32x8!(P2logf, 2.0000714765E-1); const_f32_as_f32x8!(P3logf, -1.6668057665E-1); const_f32_as_f32x8!(P4logf, 1.4249322787E-1); const_f32_as_f32x8!(P5logf, -1.2420140846E-1); const_f32_as_f32x8!(P6logf, 1.1676998740E-1); const_f32_as_f32x8!(P7logf, -1.1514610310E-1); const_f32_as_f32x8!(P8logf, 7.0376836292E-2); const_f32_as_f32x8!(p2expf, 1.0 / 2.0); // coefficients for Taylor expansion of exp const_f32_as_f32x8!(p3expf, 1.0 / 6.0); const_f32_as_f32x8!(p4expf, 1.0 / 24.0); const_f32_as_f32x8!(p5expf, 1.0 / 120.0); const_f32_as_f32x8!(p6expf, 1.0 / 720.0); const_f32_as_f32x8!(p7expf, 1.0 / 5040.0); let x1 = self.abs(); let x = x1.fraction_2(); let mask = x.cmp_gt(f32x8::SQRT_2 * f32x8::HALF); let x = (!mask).blend(x + x, x); let x = x - f32x8::ONE; let x2 = x * x; let lg1 = polynomial_8!( x, P0logf, P1logf, P2logf, P3logf, P4logf, P5logf, P6logf, P7logf, P8logf ); let lg1 = lg1 * x2 * x; let ef = x1.exponent(); let ef = mask.blend(ef + f32x8::ONE, ef); let e1 = (ef * y).round(); let yr = ef.mul_sub(y, e1); let lg = f32x8::HALF.mul_neg_add(x2, x) + lg1; let x2_err = (f32x8::HALF * x).mul_sub(x, f32x8::HALF * x2); let lg_err = f32x8::HALF.mul_add(x2, lg 
- x) - lg1; let e2 = (lg * y * f32x8::LOG2_E).round(); let v = lg.mul_sub(y, e2 * ln2f_hi); let v = e2.mul_neg_add(ln2f_lo, v); let v = v - (lg_err + x2_err).mul_sub(y, yr * f32x8::LN_2); let x = v; let e3 = (x * f32x8::LOG2_E).round(); let x = e3.mul_neg_add(f32x8::LN_2, x); let x2 = x * x; let z = x2.mul_add( polynomial_5!(x, p2expf, p3expf, p4expf, p5expf, p6expf, p7expf), x + f32x8::ONE, ); let ee = e1 + e2 + e3; let ei = cast::<_, i32x8>(ee.round_int()); let ej = cast::<_, i32x8>(ei + (cast::<_, i32x8>(z) >> 23)); let overflow = cast::<_, f32x8>(ej.cmp_gt(i32x8::splat(0x0FF))) | (ee.cmp_gt(f32x8::splat(300.0))); let underflow = cast::<_, f32x8>(ej.cmp_lt(i32x8::splat(0x000))) | (ee.cmp_lt(f32x8::splat(-300.0))); // Add exponent by integer addition let z = cast::<_, f32x8>(cast::<_, i32x8>(z) + (ei << 23)); // Check for overflow/underflow let z = underflow.blend(f32x8::ZERO, z); let z = overflow.blend(Self::infinity(), z); // Check for self == 0 let x_zero = self.is_zero_or_subnormal(); let z = x_zero.blend( y.cmp_lt(f32x8::ZERO).blend( Self::infinity(), y.cmp_eq(f32x8::ZERO).blend(f32x8::ONE, f32x8::ZERO), ), z, ); let x_sign = self.sign_bit(); let z = if x_sign.any() { // Y into an integer let yi = y.cmp_eq(y.round()); // Is y odd? let y_odd = cast::<_, i32x8>(y.round_int() << 31).round_float(); let z1 = yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow())); x_sign.blend(z1, z) } else { z }; let x_finite = self.is_finite(); let y_finite = y.is_finite(); let e_finite = ee.is_finite(); if (x_finite & y_finite & (e_finite | x_zero)).all() { return z; } (self.is_nan() | y.is_nan()).blend(self + y, z) } #[inline] pub fn powf(self, y: f32) -> Self { Self::pow_f32x8(self, f32x8::splat(y)) } /// Transpose matrix of 8x8 `f32` matrix. Currently only accelerated on AVX. #[must_use] #[inline] pub fn transpose(data: [f32x8; 8]) -> [f32x8; 8] { pick! 
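// NOTE (illustrative sketch, not part of the original source): `transpose` treats the
// input as an 8x8 matrix with one `f32x8` per row and returns its transpose, i.e.
// `out[r].as_array_ref()[c] == data[c].as_array_ref()[r]`, e.g.
//
//     use wide::f32x8;
//     let rows = [
//       f32x8::new([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]),
//       f32x8::splat(10.0), f32x8::splat(20.0), f32x8::splat(30.0), f32x8::splat(40.0),
//       f32x8::splat(50.0), f32x8::splat(60.0), f32x8::splat(70.0),
//     ];
//     let t = f32x8::transpose(rows);
//     // column 0 of `rows` becomes row 0 of `t`
//     assert_eq!(t[0].to_array(), [0.0, 10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0]);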
{ if #[cfg(target_feature="avx")] { let a0 = unpack_lo_m256(data[0].avx, data[1].avx); let a1 = unpack_hi_m256(data[0].avx, data[1].avx); let a2 = unpack_lo_m256(data[2].avx, data[3].avx); let a3 = unpack_hi_m256(data[2].avx, data[3].avx); let a4 = unpack_lo_m256(data[4].avx, data[5].avx); let a5 = unpack_hi_m256(data[4].avx, data[5].avx); let a6 = unpack_lo_m256(data[6].avx, data[7].avx); let a7 = unpack_hi_m256(data[6].avx, data[7].avx); pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 { (z << 6) | (y << 4) | (x << 2) | w } const SHUFF_LO : i32 = mm_shuffle(1,0,1,0); const SHUFF_HI : i32 = mm_shuffle(3,2,3,2); // possible todo: intel performance manual suggests alternative with blend to avoid port 5 pressure // (since blend runs on a different port than shuffle) let b0 = shuffle_m256::(a0,a2); let b1 = shuffle_m256::(a0,a2); let b2 = shuffle_m256::(a1,a3); let b3 = shuffle_m256::(a1,a3); let b4 = shuffle_m256::(a4,a6); let b5 = shuffle_m256::(a4,a6); let b6 = shuffle_m256::(a5,a7); let b7 = shuffle_m256::(a5,a7); [ f32x8 { avx: permute2z_m256::<0x20>(b0, b4) }, f32x8 { avx: permute2z_m256::<0x20>(b1, b5) }, f32x8 { avx: permute2z_m256::<0x20>(b2, b6) }, f32x8 { avx: permute2z_m256::<0x20>(b3, b7) }, f32x8 { avx: permute2z_m256::<0x31>(b0, b4) }, f32x8 { avx: permute2z_m256::<0x31>(b1, b5) }, f32x8 { avx: permute2z_m256::<0x31>(b2, b6) }, f32x8 { avx: permute2z_m256::<0x31>(b3, b7) } ] } else { // possible todo: not sure that 128bit SIMD gives us a a lot of speedup here #[inline(always)] fn transpose_column(data: &[f32x8; 8], index: usize) -> f32x8 { f32x8::new([ data[0].as_array_ref()[index], data[1].as_array_ref()[index], data[2].as_array_ref()[index], data[3].as_array_ref()[index], data[4].as_array_ref()[index], data[5].as_array_ref()[index], data[6].as_array_ref()[index], data[7].as_array_ref()[index], ]) } [ transpose_column(&data, 0), transpose_column(&data, 1), transpose_column(&data, 2), transpose_column(&data, 3), transpose_column(&data, 4), transpose_column(&data, 5), transpose_column(&data, 6), transpose_column(&data, 7), ] } } } #[inline] pub fn to_array(self) -> [f32; 8] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[f32; 8] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [f32; 8] { cast_mut(self) } #[inline] pub fn from_i32x8(v: i32x8) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx: convert_to_m256_from_i32_m256i(v.avx2) } } else { Self::new([ v.as_array_ref()[0] as f32, v.as_array_ref()[1] as f32, v.as_array_ref()[2] as f32, v.as_array_ref()[3] as f32, v.as_array_ref()[4] as f32, v.as_array_ref()[5] as f32, v.as_array_ref()[6] as f32, v.as_array_ref()[7] as f32, ]) } } } } impl Not for f32x8 { type Output = Self; #[inline] fn not(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: self.avx.not() } } else { Self { a : self.a.not(), b : self.b.not(), } } } } } wide-0.7.32/src/f64x2_.rs000066400000000000000000001466501473735473700147400ustar00rootroot00000000000000use super::*; pick! 
{ if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(16))] pub struct f64x2 { pub(crate) sse: m128d } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] pub struct f64x2 { pub(crate) simd: v128 } impl Default for f64x2 { fn default() -> Self { Self::splat(0.0) } } impl PartialEq for f64x2 { fn eq(&self, other: &Self) -> bool { u64x2_all_true(f64x2_eq(self.simd, other.simd)) } } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ use core::arch::aarch64::*; #[repr(C)] #[derive(Copy, Clone)] pub struct f64x2 { pub(crate) neon: float64x2_t } impl Default for f64x2 { #[inline] #[must_use] fn default() -> Self { unsafe { Self { neon: vdupq_n_f64(0.0)} } } } impl PartialEq for f64x2 { #[inline] #[must_use] fn eq(&self, other: &Self) -> bool { unsafe { let e = vceqq_f64(self.neon, other.neon); vgetq_lane_u64(e,0) == u64::MAX && vgetq_lane_u64(e,1) == u64::MAX } } } } else { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(16))] pub struct f64x2 { pub(crate) arr: [f64;2] } } } macro_rules! const_f64_as_f64x2 { ($i:ident, $f:expr) => { #[allow(non_upper_case_globals)] pub const $i: f64x2 = f64x2::new([$f; 2]); }; } impl f64x2 { const_f64_as_f64x2!(ONE, 1.0); const_f64_as_f64x2!(ZERO, 0.0); const_f64_as_f64x2!(HALF, 0.5); const_f64_as_f64x2!(E, core::f64::consts::E); const_f64_as_f64x2!(FRAC_1_PI, core::f64::consts::FRAC_1_PI); const_f64_as_f64x2!(FRAC_2_PI, core::f64::consts::FRAC_2_PI); const_f64_as_f64x2!(FRAC_2_SQRT_PI, core::f64::consts::FRAC_2_SQRT_PI); const_f64_as_f64x2!(FRAC_1_SQRT_2, core::f64::consts::FRAC_1_SQRT_2); const_f64_as_f64x2!(FRAC_PI_2, core::f64::consts::FRAC_PI_2); const_f64_as_f64x2!(FRAC_PI_3, core::f64::consts::FRAC_PI_3); const_f64_as_f64x2!(FRAC_PI_4, core::f64::consts::FRAC_PI_4); const_f64_as_f64x2!(FRAC_PI_6, core::f64::consts::FRAC_PI_6); const_f64_as_f64x2!(FRAC_PI_8, core::f64::consts::FRAC_PI_8); const_f64_as_f64x2!(LN_2, core::f64::consts::LN_2); const_f64_as_f64x2!(LN_10, core::f64::consts::LN_10); const_f64_as_f64x2!(LOG2_E, core::f64::consts::LOG2_E); const_f64_as_f64x2!(LOG10_E, core::f64::consts::LOG10_E); const_f64_as_f64x2!(LOG10_2, core::f64::consts::LOG10_2); const_f64_as_f64x2!(LOG2_10, core::f64::consts::LOG2_10); const_f64_as_f64x2!(PI, core::f64::consts::PI); const_f64_as_f64x2!(SQRT_2, core::f64::consts::SQRT_2); const_f64_as_f64x2!(TAU, core::f64::consts::TAU); } unsafe impl Zeroable for f64x2 {} unsafe impl Pod for f64x2 {} impl Add for f64x2 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: add_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_f64(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0] + rhs.arr[0], self.arr[1] + rhs.arr[1], ]} } } } } impl Sub for f64x2 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: sub_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vsubq_f64(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0] - rhs.arr[0], self.arr[1] - rhs.arr[1], ]} } } } } impl Mul for f64x2 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: mul_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_mul(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmulq_f64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0] * rhs.arr[0], self.arr[1] * rhs.arr[1], ]} } } } } impl Div for f64x2 { type Output = Self; #[inline] #[must_use] fn div(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: div_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_div(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vdivq_f64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0] / rhs.arr[0], self.arr[1] / rhs.arr[1], ]} } } } } impl Add for f64x2 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: f64) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for f64x2 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: f64) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for f64x2 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: f64) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Div for f64x2 { type Output = Self; #[inline] #[must_use] fn div(self, rhs: f64) -> Self::Output { self.div(Self::splat(rhs)) } } impl Add for f64 { type Output = f64x2; #[inline] #[must_use] fn add(self, rhs: f64x2) -> Self::Output { f64x2::splat(self).add(rhs) } } impl Sub for f64 { type Output = f64x2; #[inline] #[must_use] fn sub(self, rhs: f64x2) -> Self::Output { f64x2::splat(self).sub(rhs) } } impl Mul for f64 { type Output = f64x2; #[inline] #[must_use] fn mul(self, rhs: f64x2) -> Self::Output { f64x2::splat(self).mul(rhs) } } impl Div for f64 { type Output = f64x2; #[inline] #[must_use] fn div(self, rhs: f64x2) -> Self::Output { f64x2::splat(self).div(rhs) } } impl BitAnd for f64x2 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitand_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(self.neon), vreinterpretq_u64_f64(rhs.neon))) }} } else { Self { arr: [ f64::from_bits(self.arr[0].to_bits() & rhs.arr[0].to_bits()), f64::from_bits(self.arr[1].to_bits() & rhs.arr[1].to_bits()), ]} } } } } impl BitOr for f64x2 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: bitor_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(self.neon), vreinterpretq_u64_f64(rhs.neon))) }} } else { Self { arr: [ f64::from_bits(self.arr[0].to_bits() | rhs.arr[0].to_bits()), f64::from_bits(self.arr[1].to_bits() | rhs.arr[1].to_bits()), ]} } } } } impl BitXor for f64x2 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitxor_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(self.neon), vreinterpretq_u64_f64(rhs.neon))) }} } else { Self { arr: [ f64::from_bits(self.arr[0].to_bits() ^ rhs.arr[0].to_bits()), f64::from_bits(self.arr[1].to_bits() ^ rhs.arr[1].to_bits()), ]} } } } } impl CmpEq for f64x2 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_eq_mask_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(vceqq_f64(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] == rhs.arr[0] { f64::from_bits(u64::MAX) } else { 0.0 }, if self.arr[1] == rhs.arr[1] { f64::from_bits(u64::MAX) } else { 0.0 }, ]} } } } } impl CmpGe for f64x2 { type Output = Self; #[inline] #[must_use] fn cmp_ge(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_ge_mask_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_ge(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(vcgeq_f64(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] >= rhs.arr[0] { f64::from_bits(u64::MAX) } else { 0.0 }, if self.arr[1] >= rhs.arr[1] { f64::from_bits(u64::MAX) } else { 0.0 }, ]} } } } } impl CmpGt for f64x2 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { sse: cmp_op_mask_m128d::<{cmp_op!(GreaterThanOrdered)}>(self.sse, rhs.sse) } } else if #[cfg(target_feature="sse2")] { Self { sse: cmp_gt_mask_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_gt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(vcgtq_f64(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] > rhs.arr[0] { f64::from_bits(u64::MAX) } else { 0.0 }, if self.arr[1] > rhs.arr[1] { f64::from_bits(u64::MAX) } else { 0.0 }, ]} } } } } impl CmpNe for f64x2 { type Output = Self; #[inline] #[must_use] fn cmp_ne(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: cmp_neq_mask_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_ne(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(vceqq_f64(self.neon, rhs.neon)) }.not() } } else { Self { arr: [ if self.arr[0] != rhs.arr[0] { f64::from_bits(u64::MAX) } else { 0.0 }, if self.arr[1] != rhs.arr[1] { f64::from_bits(u64::MAX) } else { 0.0 }, ]} } } } } impl CmpLe for f64x2 { type Output = Self; #[inline] #[must_use] fn cmp_le(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_le_mask_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_le(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(vcleq_f64(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] <= rhs.arr[0] { f64::from_bits(u64::MAX) } else { 0.0 }, if self.arr[1] <= rhs.arr[1] { f64::from_bits(u64::MAX) } else { 0.0 }, ]} } } } } impl CmpLt for f64x2 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_lt_mask_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_lt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(vcltq_f64(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] < rhs.arr[0] { f64::from_bits(u64::MAX) } else { 0.0 }, if self.arr[1] < rhs.arr[1] { f64::from_bits(u64::MAX) } else { 0.0 }, ]} } } } } impl f64x2 { #[inline] #[must_use] pub const fn new(array: [f64; 2]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_m128d(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else { generic_bit_blend(self, t, f) } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="simd128")] { Self { simd: f64x2_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vabsq_f64(self.neon) }} } else { let non_sign_bits = f64x2::from(f64::from_bits(i64::MAX as u64)); self & non_sign_bits } } } #[inline] #[must_use] pub fn floor(self) -> Self { pick! { if #[cfg(target_feature="simd128")] { Self { simd: f64x2_floor(self.simd) } } else if #[cfg(target_feature="sse4.1")] { Self { sse: floor_m128d(self.sse) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vrndmq_f64(self.neon) }} } else if #[cfg(feature="std")] { let base: [f64; 2] = cast(self); cast(base.map(|val| val.floor())) } else { let base: [f64; 2] = cast(self); let rounded: [f64; 2] = cast(self.round()); cast([ if base[0] < rounded[0] { rounded[0] - 1.0 } else { rounded[0] }, if base[1] < rounded[1] { rounded[1] - 1.0 } else { rounded[1] }, ]) } } } #[inline] #[must_use] pub fn ceil(self) -> Self { pick! 
{ if #[cfg(target_feature="simd128")] { Self { simd: f64x2_ceil(self.simd) } } else if #[cfg(target_feature="sse4.1")] { Self { sse: ceil_m128d(self.sse) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vrndpq_f64(self.neon) }} } else if #[cfg(feature="std")] { let base: [f64; 2] = cast(self); cast(base.map(|val| val.ceil())) } else { let base: [f64; 2] = cast(self); let rounded: [f64; 2] = cast(self.round()); cast([ if base[0] > rounded[0] { rounded[0] + 1.0 } else { rounded[0] }, if base[1] > rounded[1] { rounded[1] + 1.0 } else { rounded[1] }, ]) } } } /// Calculates the lanewise maximum of both vectors. This is a faster /// implementation than `max`, but it doesn't specify any behavior if NaNs are /// involved. #[inline] #[must_use] pub fn fast_max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: max_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_pmax(self.simd, rhs.simd), } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxq_f64(self.neon, rhs.neon) }} } else { Self { arr: [ if self.arr[0] < rhs.arr[0] { rhs.arr[0] } else { self.arr[0] }, if self.arr[1] < rhs.arr[1] { rhs.arr[1] } else { self.arr[1] }, ]} } } } /// Calculates the lanewise maximum of both vectors. If either lane is NaN, /// the other lane gets chosen. Use `fast_max` for a faster implementation /// that doesn't handle NaNs. #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { // max_m128d seems to do rhs < self ? self : rhs. So if there's any NaN // involved, it chooses rhs, so we need to specifically check rhs for // NaN. rhs.is_nan().blend(self, Self { sse: max_m128d(self.sse, rhs.sse) }) } else if #[cfg(target_feature="simd128")] { // WASM has two max intrinsics: // - max: This propagates NaN, that's the opposite of what we need. // - pmax: This is defined as self < rhs ? rhs : self, which basically // chooses self if either is NaN. // // pmax is what we want, but we need to specifically check self for NaN. Self { simd: v128_bitselect( rhs.simd, f64x2_pmax(self.simd, rhs.simd), f64x2_ne(self.simd, self.simd), // NaN check ) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxnmq_f64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].max(rhs.arr[0]), self.arr[1].max(rhs.arr[1]), ]} } } } /// Calculates the lanewise minimum of both vectors. This is a faster /// implementation than `min`, but it doesn't specify any behavior if NaNs are /// involved. #[inline] #[must_use] pub fn fast_min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: min_m128d(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_pmin(self.simd, rhs.simd), } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminq_f64(self.neon, rhs.neon) }} } else { Self { arr: [ if self.arr[0] < rhs.arr[0] { self.arr[0] } else { rhs.arr[0] }, if self.arr[1] < rhs.arr[1] { self.arr[1] } else { rhs.arr[1] }, ]} } } } /// Calculates the lanewise minimum of both vectors. If either lane is NaN, /// the other lane gets chosen. Use `fast_min` for a faster implementation /// that doesn't handle NaNs. #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { // min_m128d seems to do rhs < self ? rhs : self. 
So if there's any NaN // involved, it chooses rhs, so we need to specifically check rhs for // NaN. rhs.is_nan().blend(self, Self { sse: min_m128d(self.sse, rhs.sse) }) } else if #[cfg(target_feature="simd128")] { // WASM has two min intrinsics: // - min: This propagates NaN, that's the opposite of what we need. // - pmin: This is defined as rhs < self ? rhs : self, which basically // chooses self if either is NaN. // // pmin is what we want, but we need to specifically check self for NaN. Self { simd: v128_bitselect( rhs.simd, f64x2_pmin(self.simd, rhs.simd), f64x2_ne(self.simd, self.simd), // NaN check ) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminnmq_f64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].min(rhs.arr[0]), self.arr[1].min(rhs.arr[1]), ]} } } } #[inline] #[must_use] pub fn is_nan(self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_unord_mask_m128d(self.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_ne(self.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u64(vceqq_f64(self.neon, self.neon)) }.not() } } else { Self { arr: [ if self.arr[0].is_nan() { f64::from_bits(u64::MAX) } else { 0.0 }, if self.arr[1].is_nan() { f64::from_bits(u64::MAX) } else { 0.0 }, ]} } } } #[inline] #[must_use] pub fn is_finite(self) -> Self { let shifted_exp_mask = u64x2::from(0xFFE0000000000000); let u: u64x2 = cast(self); let shift_u = u << 1_u64; let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask); cast(out) } #[inline] #[must_use] pub fn is_inf(self) -> Self { let shifted_inf = u64x2::from(0xFFE0000000000000); let u: u64x2 = cast(self); let shift_u = u << 1_u64; let out = (shift_u).cmp_eq(shifted_inf); cast(out) } #[inline] #[must_use] pub fn round(self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: round_m128d::<{round_op!(Nearest)}>(self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_nearest(self.simd) } } else { let sign_mask = f64x2::from(-0.0); let magic = f64x2::from(f64::from_bits(0x43300000_00000000)); let sign = self & sign_mask; let signed_magic = magic | sign; self + signed_magic - signed_magic } } } #[inline] #[must_use] pub fn round_int(self) -> i64x2 { let rounded: [f64; 2] = cast(self.round()); cast([rounded[0] as i64, rounded[1] as i64]) } #[inline] #[must_use] pub fn mul_add(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="fma"))] { Self { sse: fused_mul_add_m128d(self.sse, m.sse, a.sse) } } else { (self * m) + a } } } #[inline] #[must_use] pub fn mul_sub(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="fma"))] { Self { sse: fused_mul_sub_m128d(self.sse, m.sse, a.sse) } } else { (self * m) - a } } } #[inline] #[must_use] pub fn mul_neg_add(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="fma"))] { Self { sse: fused_mul_neg_add_m128d(self.sse, m.sse, a.sse) } } else { a - (self * m) } } } #[inline] #[must_use] pub fn mul_neg_sub(self, m: Self, a: Self) -> Self { pick! 
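// NOTE (illustrative sketch, not part of the original source): the
// `mul_add`/`mul_sub`/`mul_neg_add`/`mul_neg_sub` helpers combine a product with a third
// operand; with the `fma` target feature they compile to a single fused instruction,
// otherwise they use the plain multiply-then-add/sub fallbacks, so the last bit of
// rounding can differ between builds, e.g.
//
//     use wide::f64x2;
//     let (a, m, b) = (f64x2::splat(2.0), f64x2::splat(3.0), f64x2::splat(1.0));
//     assert!(a.mul_add(m, b)     == f64x2::splat(7.0));   //  a*m + b
//     assert!(a.mul_sub(m, b)     == f64x2::splat(5.0));   //  a*m - b
//     assert!(a.mul_neg_add(m, b) == f64x2::splat(-5.0));  //  b - a*m
//     assert!(a.mul_neg_sub(m, b) == f64x2::splat(-7.0));  // -a*m - b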
{ if #[cfg(all(target_feature="fma"))] { Self { sse: fused_mul_neg_sub_m128d(self.sse, m.sse, a.sse) } } else { -(self * m) - a } } } #[inline] #[must_use] pub fn flip_signs(self, signs: Self) -> Self { self ^ (signs & Self::from(-0.0)) } #[inline] #[must_use] pub fn copysign(self, sign: Self) -> Self { let magnitude_mask = Self::from(f64::from_bits(u64::MAX >> 1)); (self & magnitude_mask) | (sign & Self::from(-0.0)) } #[inline] pub fn asin_acos(self) -> (Self, Self) { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x2!(R4asin, 2.967721961301243206100E-3); const_f64_as_f64x2!(R3asin, -5.634242780008963776856E-1); const_f64_as_f64x2!(R2asin, 6.968710824104713396794E0); const_f64_as_f64x2!(R1asin, -2.556901049652824852289E1); const_f64_as_f64x2!(R0asin, 2.853665548261061424989E1); const_f64_as_f64x2!(S3asin, -2.194779531642920639778E1); const_f64_as_f64x2!(S2asin, 1.470656354026814941758E2); const_f64_as_f64x2!(S1asin, -3.838770957603691357202E2); const_f64_as_f64x2!(S0asin, 3.424398657913078477438E2); const_f64_as_f64x2!(P5asin, 4.253011369004428248960E-3); const_f64_as_f64x2!(P4asin, -6.019598008014123785661E-1); const_f64_as_f64x2!(P3asin, 5.444622390564711410273E0); const_f64_as_f64x2!(P2asin, -1.626247967210700244449E1); const_f64_as_f64x2!(P1asin, 1.956261983317594739197E1); const_f64_as_f64x2!(P0asin, -8.198089802484824371615E0); const_f64_as_f64x2!(Q4asin, -1.474091372988853791896E1); const_f64_as_f64x2!(Q3asin, 7.049610280856842141659E1); const_f64_as_f64x2!(Q2asin, -1.471791292232726029859E2); const_f64_as_f64x2!(Q1asin, 1.395105614657485689735E2); const_f64_as_f64x2!(Q0asin, -4.918853881490881290097E1); let xa = self.abs(); let big = xa.cmp_ge(f64x2::splat(0.625)); let x1 = big.blend(f64x2::splat(1.0) - xa, xa * xa); let x2 = x1 * x1; let x3 = x2 * x1; let x4 = x2 * x2; let x5 = x4 * x1; let do_big = big.any(); let do_small = !big.all(); let mut rx = f64x2::default(); let mut sx = f64x2::default(); let mut px = f64x2::default(); let mut qx = f64x2::default(); if do_big { rx = x3.mul_add(R3asin, x2 * R2asin) + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin)); sx = x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin)); } if do_small { px = x3.mul_add(P3asin, P0asin) + x4.mul_add(P4asin, x1 * P1asin) + x5.mul_add(P5asin, x2 * P2asin); qx = x4.mul_add(Q4asin, x5) + x3.mul_add(Q3asin, x1 * Q1asin) + x2.mul_add(Q2asin, Q0asin); }; let vx = big.blend(rx, px); let wx = big.blend(sx, qx); let y1 = vx / wx * x1; let mut z1 = f64x2::default(); let mut z2 = f64x2::default(); if do_big { let xb = (x1 + x1).sqrt(); z1 = xb.mul_add(y1, xb); } if do_small { z2 = xa.mul_add(y1, xa); } // asin let z3 = f64x2::FRAC_PI_2 - z1; let asin = big.blend(z3, z2); let asin = asin.flip_signs(self); // acos let z3 = self.cmp_lt(f64x2::ZERO).blend(f64x2::PI - z1, z1); let z4 = f64x2::FRAC_PI_2 - z2.flip_signs(self); let acos = big.blend(z3, z4); (asin, acos) } #[inline] pub fn acos(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x2!(R4asin, 2.967721961301243206100E-3); const_f64_as_f64x2!(R3asin, -5.634242780008963776856E-1); const_f64_as_f64x2!(R2asin, 6.968710824104713396794E0); const_f64_as_f64x2!(R1asin, -2.556901049652824852289E1); const_f64_as_f64x2!(R0asin, 2.853665548261061424989E1); const_f64_as_f64x2!(S3asin, -2.194779531642920639778E1); const_f64_as_f64x2!(S2asin, 
1.470656354026814941758E2); const_f64_as_f64x2!(S1asin, -3.838770957603691357202E2); const_f64_as_f64x2!(S0asin, 3.424398657913078477438E2); const_f64_as_f64x2!(P5asin, 4.253011369004428248960E-3); const_f64_as_f64x2!(P4asin, -6.019598008014123785661E-1); const_f64_as_f64x2!(P3asin, 5.444622390564711410273E0); const_f64_as_f64x2!(P2asin, -1.626247967210700244449E1); const_f64_as_f64x2!(P1asin, 1.956261983317594739197E1); const_f64_as_f64x2!(P0asin, -8.198089802484824371615E0); const_f64_as_f64x2!(Q4asin, -1.474091372988853791896E1); const_f64_as_f64x2!(Q3asin, 7.049610280856842141659E1); const_f64_as_f64x2!(Q2asin, -1.471791292232726029859E2); const_f64_as_f64x2!(Q1asin, 1.395105614657485689735E2); const_f64_as_f64x2!(Q0asin, -4.918853881490881290097E1); let xa = self.abs(); let big = xa.cmp_ge(f64x2::splat(0.625)); let x1 = big.blend(f64x2::splat(1.0) - xa, xa * xa); let x2 = x1 * x1; let x3 = x2 * x1; let x4 = x2 * x2; let x5 = x4 * x1; let do_big = big.any(); let do_small = !big.all(); let mut rx = f64x2::default(); let mut sx = f64x2::default(); let mut px = f64x2::default(); let mut qx = f64x2::default(); if do_big { rx = x3.mul_add(R3asin, x2 * R2asin) + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin)); sx = x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin)); } if do_small { px = x3.mul_add(P3asin, P0asin) + x4.mul_add(P4asin, x1 * P1asin) + x5.mul_add(P5asin, x2 * P2asin); qx = x4.mul_add(Q4asin, x5) + x3.mul_add(Q3asin, x1 * Q1asin) + x2.mul_add(Q2asin, Q0asin); }; let vx = big.blend(rx, px); let wx = big.blend(sx, qx); let y1 = vx / wx * x1; let mut z1 = f64x2::default(); let mut z2 = f64x2::default(); if do_big { let xb = (x1 + x1).sqrt(); z1 = xb.mul_add(y1, xb); } if do_small { z2 = xa.mul_add(y1, xa); } // acos let z3 = self.cmp_lt(f64x2::ZERO).blend(f64x2::PI - z1, z1); let z4 = f64x2::FRAC_PI_2 - z2.flip_signs(self); let acos = big.blend(z3, z4); acos } #[inline] pub fn asin(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x2!(R4asin, 2.967721961301243206100E-3); const_f64_as_f64x2!(R3asin, -5.634242780008963776856E-1); const_f64_as_f64x2!(R2asin, 6.968710824104713396794E0); const_f64_as_f64x2!(R1asin, -2.556901049652824852289E1); const_f64_as_f64x2!(R0asin, 2.853665548261061424989E1); const_f64_as_f64x2!(S3asin, -2.194779531642920639778E1); const_f64_as_f64x2!(S2asin, 1.470656354026814941758E2); const_f64_as_f64x2!(S1asin, -3.838770957603691357202E2); const_f64_as_f64x2!(S0asin, 3.424398657913078477438E2); const_f64_as_f64x2!(P5asin, 4.253011369004428248960E-3); const_f64_as_f64x2!(P4asin, -6.019598008014123785661E-1); const_f64_as_f64x2!(P3asin, 5.444622390564711410273E0); const_f64_as_f64x2!(P2asin, -1.626247967210700244449E1); const_f64_as_f64x2!(P1asin, 1.956261983317594739197E1); const_f64_as_f64x2!(P0asin, -8.198089802484824371615E0); const_f64_as_f64x2!(Q4asin, -1.474091372988853791896E1); const_f64_as_f64x2!(Q3asin, 7.049610280856842141659E1); const_f64_as_f64x2!(Q2asin, -1.471791292232726029859E2); const_f64_as_f64x2!(Q1asin, 1.395105614657485689735E2); const_f64_as_f64x2!(Q0asin, -4.918853881490881290097E1); let xa = self.abs(); let big = xa.cmp_ge(f64x2::splat(0.625)); let x1 = big.blend(f64x2::splat(1.0) - xa, xa * xa); let x2 = x1 * x1; let x3 = x2 * x1; let x4 = x2 * x2; let x5 = x4 * x1; let do_big = big.any(); let do_small = !big.all(); let mut rx = f64x2::default(); let mut sx = f64x2::default(); let mut px = f64x2::default(); let mut 
qx = f64x2::default(); if do_big { rx = x3.mul_add(R3asin, x2 * R2asin) + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin)); sx = x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin)); } if do_small { px = x3.mul_add(P3asin, P0asin) + x4.mul_add(P4asin, x1 * P1asin) + x5.mul_add(P5asin, x2 * P2asin); qx = x4.mul_add(Q4asin, x5) + x3.mul_add(Q3asin, x1 * Q1asin) + x2.mul_add(Q2asin, Q0asin); }; let vx = big.blend(rx, px); let wx = big.blend(sx, qx); let y1 = vx / wx * x1; let mut z1 = f64x2::default(); let mut z2 = f64x2::default(); if do_big { let xb = (x1 + x1).sqrt(); z1 = xb.mul_add(y1, xb); } if do_small { z2 = xa.mul_add(y1, xa); } // asin let z3 = f64x2::FRAC_PI_2 - z1; let asin = big.blend(z3, z2); let asin = asin.flip_signs(self); asin } #[inline] pub fn atan(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x2!(MORE_BITS, 6.123233995736765886130E-17); const_f64_as_f64x2!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5); const_f64_as_f64x2!(T3PO8, core::f64::consts::SQRT_2 + 1.0); const_f64_as_f64x2!(P4atan, -8.750608600031904122785E-1); const_f64_as_f64x2!(P3atan, -1.615753718733365076637E1); const_f64_as_f64x2!(P2atan, -7.500855792314704667340E1); const_f64_as_f64x2!(P1atan, -1.228866684490136173410E2); const_f64_as_f64x2!(P0atan, -6.485021904942025371773E1); const_f64_as_f64x2!(Q4atan, 2.485846490142306297962E1); const_f64_as_f64x2!(Q3atan, 1.650270098316988542046E2); const_f64_as_f64x2!(Q2atan, 4.328810604912902668951E2); const_f64_as_f64x2!(Q1atan, 4.853903996359136964868E2); const_f64_as_f64x2!(Q0atan, 1.945506571482613964425E2); let t = self.abs(); // small: t < 0.66 // medium: t <= t <= 2.4142 (1+sqrt(2)) // big: t > 2.4142 let notbig = t.cmp_le(T3PO8); let notsmal = t.cmp_ge(Self::splat(0.66)); let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2); s = notsmal & s; let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS); fac = notsmal & fac; // small: z = t / 1.0; // medium: z = (t-1.0) / (t+1.0); // big: z = -1.0 / t; let mut a = notbig & t; a = notsmal.blend(a - Self::ONE, a); let mut b = notbig & Self::ONE; b = notsmal.blend(b + t, b); let z = a / b; let zz = z * z; let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan); let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan); let mut re = (px / qx).mul_add(z * zz, z); re += s + fac; // get sign bit re = (self.sign_bit()).blend(-re, re); re } #[inline] pub fn atan2(self, x: Self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x2!(MORE_BITS, 6.123233995736765886130E-17); const_f64_as_f64x2!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5); const_f64_as_f64x2!(T3PO8, core::f64::consts::SQRT_2 + 1.0); const_f64_as_f64x2!(P4atan, -8.750608600031904122785E-1); const_f64_as_f64x2!(P3atan, -1.615753718733365076637E1); const_f64_as_f64x2!(P2atan, -7.500855792314704667340E1); const_f64_as_f64x2!(P1atan, -1.228866684490136173410E2); const_f64_as_f64x2!(P0atan, -6.485021904942025371773E1); const_f64_as_f64x2!(Q4atan, 2.485846490142306297962E1); const_f64_as_f64x2!(Q3atan, 1.650270098316988542046E2); const_f64_as_f64x2!(Q2atan, 4.328810604912902668951E2); const_f64_as_f64x2!(Q1atan, 4.853903996359136964868E2); const_f64_as_f64x2!(Q0atan, 1.945506571482613964425E2); let y = self; // move in first octant let x1 = x.abs(); let y1 = y.abs(); let swapxy = y1.cmp_gt(x1); // swap x and y if y1 > 
x1 let mut x2 = swapxy.blend(y1, x1); let mut y2 = swapxy.blend(x1, y1); // check for special case: x and y are both +/- INF let both_infinite = x.is_inf() & y.is_inf(); if both_infinite.any() { let minus_one = -Self::ONE; x2 = both_infinite.blend(x2 & minus_one, x2); y2 = both_infinite.blend(y2 & minus_one, y2); } // x = y = 0 gives NAN here let t = y2 / x2; // small: t < 0.66 // medium: t <= t <= 2.4142 (1+sqrt(2)) // big: t > 2.4142 let notbig = t.cmp_le(T3PO8); let notsmal = t.cmp_ge(Self::splat(0.66)); let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2); s = notsmal & s; let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS); fac = notsmal & fac; // small: z = t / 1.0; // medium: z = (t-1.0) / (t+1.0); // big: z = -1.0 / t; let mut a = notbig & t; a = notsmal.blend(a - Self::ONE, a); let mut b = notbig & Self::ONE; b = notsmal.blend(b + t, b); let z = a / b; let zz = z * z; let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan); let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan); let mut re = (px / qx).mul_add(z * zz, z); re += s + fac; // move back in place re = swapxy.blend(Self::FRAC_PI_2 - re, re); re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re); re = (x.sign_bit()).blend(Self::PI - re, re); // get sign bit re = (y.sign_bit()).blend(-re, re); re } #[inline] #[must_use] pub fn sin_cos(self) -> (Self, Self) { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x2!(P0sin, -1.66666666666666307295E-1); const_f64_as_f64x2!(P1sin, 8.33333333332211858878E-3); const_f64_as_f64x2!(P2sin, -1.98412698295895385996E-4); const_f64_as_f64x2!(P3sin, 2.75573136213857245213E-6); const_f64_as_f64x2!(P4sin, -2.50507477628578072866E-8); const_f64_as_f64x2!(P5sin, 1.58962301576546568060E-10); const_f64_as_f64x2!(P0cos, 4.16666666666665929218E-2); const_f64_as_f64x2!(P1cos, -1.38888888888730564116E-3); const_f64_as_f64x2!(P2cos, 2.48015872888517045348E-5); const_f64_as_f64x2!(P3cos, -2.75573141792967388112E-7); const_f64_as_f64x2!(P4cos, 2.08757008419747316778E-9); const_f64_as_f64x2!(P5cos, -1.13585365213876817300E-11); const_f64_as_f64x2!(DP1, 7.853981554508209228515625E-1 * 2.); const_f64_as_f64x2!(DP2, 7.94662735614792836714E-9 * 2.); const_f64_as_f64x2!(DP3, 3.06161699786838294307E-17 * 2.); const_f64_as_f64x2!(TWO_OVER_PI, 2.0 / core::f64::consts::PI); let xa = self.abs(); let y = (xa * TWO_OVER_PI).round(); let q = y.round_int(); let x = y.mul_neg_add(DP3, y.mul_neg_add(DP2, y.mul_neg_add(DP1, xa))); let x2 = x * x; let mut s = polynomial_5!(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin); let mut c = polynomial_5!(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos); s = (x * x2).mul_add(s, x); c = (x2 * x2).mul_add(c, x2.mul_neg_add(f64x2::from(0.5), f64x2::from(1.0))); let swap = !((q & i64x2::from(1)).cmp_eq(i64x2::from(0))); let mut overflow: f64x2 = cast(q.cmp_gt(i64x2::from(0x80000000000000))); overflow &= xa.is_finite(); s = overflow.blend(f64x2::from(0.0), s); c = overflow.blend(f64x2::from(1.0), c); // calc sin let mut sin1 = cast::<_, f64x2>(swap).blend(c, s); let sign_sin: i64x2 = (q << 62) ^ cast::<_, i64x2>(self); sin1 = sin1.flip_signs(cast(sign_sin)); // calc cos let mut cos1 = cast::<_, f64x2>(swap).blend(s, c); let sign_cos: i64x2 = ((q + i64x2::from(1)) & i64x2::from(2)) << 62; cos1 ^= cast::<_, f64x2>(sign_cos); (sin1, cos1) } #[inline] #[must_use] pub fn sin(self) -> Self { let (s, _) = self.sin_cos(); s } #[inline] #[must_use] pub fn cos(self) -> Self { let 
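// Illustrative usage sketch, values assumed and not from the crate's tests: `sin_cos`
// does the range reduction once and returns both results, so callers that need both
// should prefer it over separate `sin` and `cos` calls.
//   let (s, c) = f64x2::splat(core::f64::consts::FRAC_PI_2).sin_cos();
//   // every lane of `s` is approximately 1.0 and every lane of `c` is approximately 0.0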
(_, c) = self.sin_cos(); c } #[inline] #[must_use] pub fn tan(self) -> Self { let (s, c) = self.sin_cos(); s / c } #[inline] #[must_use] pub fn to_degrees(self) -> Self { const_f64_as_f64x2!(RAD_TO_DEG_RATIO, 180.0_f64 / core::f64::consts::PI); self * RAD_TO_DEG_RATIO } #[inline] #[must_use] pub fn to_radians(self) -> Self { const_f64_as_f64x2!(DEG_TO_RAD_RATIO, core::f64::consts::PI / 180.0_f64); self * DEG_TO_RAD_RATIO } #[inline] #[must_use] pub fn sqrt(self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sqrt_m128d(self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_sqrt(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vsqrtq_f64(self.neon) }} } else if #[cfg(feature="std")] { Self { arr: [ self.arr[0].sqrt(), self.arr[1].sqrt(), ]} } else { Self { arr: [ software_sqrt(self.arr[0]), software_sqrt(self.arr[1]), ]} } } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="sse2")] { move_mask_m128d(self.sse) } else if #[cfg(target_feature="simd128")] { u64x2_bitmask(self.simd) as i32 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { let e = vreinterpretq_u64_f64(self.neon); (vgetq_lane_u64(e,0) >> 63 | ((vgetq_lane_u64(e,1) >> 62) & 0x2)) as i32 } } else { (((self.arr[0].to_bits() as i64) < 0) as i32) << 0 | (((self.arr[1].to_bits() as i64) < 0) as i32) << 1 } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="simd128")] { v128_any_true(self.simd) } else { self.move_mask() != 0 } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="simd128")] { u64x2_all_true(self.simd) } else { // two lanes self.move_mask() == 0b11 } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] fn vm_pow2n(self) -> Self { const_f64_as_f64x2!(pow2_52, 4503599627370496.0); const_f64_as_f64x2!(bias, 1023.0); let a = self + (bias + pow2_52); let c = cast::<_, i64x2>(a) << 52; cast::<_, f64x2>(c) } /// Calculate the exponent of a packed `f64x2` #[inline] #[must_use] pub fn exp(self) -> Self { const_f64_as_f64x2!(P2, 1.0 / 2.0); const_f64_as_f64x2!(P3, 1.0 / 6.0); const_f64_as_f64x2!(P4, 1. / 24.); const_f64_as_f64x2!(P5, 1. / 120.); const_f64_as_f64x2!(P6, 1. / 720.); const_f64_as_f64x2!(P7, 1. / 5040.); const_f64_as_f64x2!(P8, 1. / 40320.); const_f64_as_f64x2!(P9, 1. / 362880.); const_f64_as_f64x2!(P10, 1. / 3628800.); const_f64_as_f64x2!(P11, 1. / 39916800.); const_f64_as_f64x2!(P12, 1. / 479001600.); const_f64_as_f64x2!(P13, 1. 
/ 6227020800.); const_f64_as_f64x2!(LN2D_HI, 0.693145751953125); const_f64_as_f64x2!(LN2D_LO, 1.42860682030941723212E-6); let max_x = f64x2::from(708.39); let r = (self * Self::LOG2_E).round(); let x = r.mul_neg_add(LN2D_HI, self); let x = r.mul_neg_add(LN2D_LO, x); let z = polynomial_13!(x, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13); let n2 = Self::vm_pow2n(r); let z = (z + Self::ONE) * n2; // check for overflow let in_range = self.abs().cmp_lt(max_x); let in_range = in_range & self.is_finite(); in_range.blend(z, Self::ZERO) } #[inline] fn exponent(self) -> f64x2 { const_f64_as_f64x2!(pow2_52, 4503599627370496.0); const_f64_as_f64x2!(bias, 1023.0); let a = cast::<_, u64x2>(self); let b = a >> 52; let c = b | cast::<_, u64x2>(pow2_52); let d = cast::<_, f64x2>(c); let e = d - (pow2_52 + bias); e } #[inline] fn fraction_2(self) -> Self { let t1 = cast::<_, u64x2>(self); let t2 = cast::<_, u64x2>( (t1 & u64x2::from(0x000FFFFFFFFFFFFF)) | u64x2::from(0x3FE0000000000000), ); cast::<_, f64x2>(t2) } #[inline] fn is_zero_or_subnormal(self) -> Self { let t = cast::<_, i64x2>(self); let t = t & i64x2::splat(0x7FF0000000000000); i64x2::round_float(t.cmp_eq(i64x2::splat(0))) } #[inline] fn infinity() -> Self { cast::<_, f64x2>(i64x2::splat(0x7FF0000000000000)) } #[inline] fn nan_log() -> Self { cast::<_, f64x2>(i64x2::splat(0x7FF8000000000000 | 0x101 << 29)) } #[inline] fn nan_pow() -> Self { cast::<_, f64x2>(i64x2::splat(0x7FF8000000000000 | 0x101 << 29)) } #[inline] fn sign_bit(self) -> Self { let t1 = cast::<_, i64x2>(self); let t2 = t1 >> 63; !cast::<_, f64x2>(t2).cmp_eq(f64x2::ZERO) } /// horizontal add of all the elements of the vector #[inline] #[must_use] pub fn reduce_add(self) -> f64 { pick! { if #[cfg(target_feature="ssse3")] { let a = add_horizontal_m128d(self.sse, self.sse); a.to_array()[0] } else if #[cfg(any(target_feature="sse2", target_feature="simd128"))] { let a: [f64;2] = cast(self); a.iter().sum() } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { vgetq_lane_f64(self.neon,0) + vgetq_lane_f64(self.neon,1) } } else { self.arr.iter().sum() } } } #[inline] #[must_use] pub fn ln(self) -> Self { const_f64_as_f64x2!(P0, 7.70838733755885391666E0); const_f64_as_f64x2!(P1, 1.79368678507819816313E1); const_f64_as_f64x2!(P2, 1.44989225341610930846E1); const_f64_as_f64x2!(P3, 4.70579119878881725854E0); const_f64_as_f64x2!(P4, 4.97494994976747001425E-1); const_f64_as_f64x2!(P5, 1.01875663804580931796E-4); const_f64_as_f64x2!(Q0, 2.31251620126765340583E1); const_f64_as_f64x2!(Q1, 7.11544750618563894466E1); const_f64_as_f64x2!(Q2, 8.29875266912776603211E1); const_f64_as_f64x2!(Q3, 4.52279145837532221105E1); const_f64_as_f64x2!(Q4, 1.12873587189167450590E1); const_f64_as_f64x2!(LN2F_HI, 0.693359375); const_f64_as_f64x2!(LN2F_LO, -2.12194440e-4); const_f64_as_f64x2!(VM_SQRT2, 1.414213562373095048801); const_f64_as_f64x2!(VM_SMALLEST_NORMAL, 1.17549435E-38); let x1 = self; let x = Self::fraction_2(x1); let e = Self::exponent(x1); let mask = x.cmp_gt(VM_SQRT2 * f64x2::HALF); let x = (!mask).blend(x + x, x); let fe = mask.blend(e + Self::ONE, e); let x = x - Self::ONE; let px = polynomial_5!(x, P0, P1, P2, P3, P4, P5); let x2 = x * x; let px = x2 * x * px; let qx = polynomial_5n!(x, Q0, Q1, Q2, Q3, Q4); let res = px / qx; let res = fe.mul_add(LN2F_LO, res); let res = res + x2.mul_neg_add(f64x2::HALF, x); let res = fe.mul_add(LN2F_HI, res); let overflow = !self.is_finite(); let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL); let mask = overflow | underflow; if !mask.any() 
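// Fast path: when no lane tripped the overflow/underflow mask, the polynomial result
// is returned directly and the special-value patch-up below is skipped. A minimal
// sketch, values assumed:
//   let v = f64x2::splat(core::f64::consts::E).ln();
//   // every lane is approximately 1.0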
{ res } else { let is_zero = self.is_zero_or_subnormal(); let res = underflow.blend(Self::nan_log(), res); let res = is_zero.blend(Self::infinity(), res); let res = overflow.blend(self, res); res } } #[inline] #[must_use] pub fn log2(self) -> Self { Self::ln(self) * Self::LOG2_E } #[inline] #[must_use] pub fn log10(self) -> Self { Self::ln(self) * Self::LOG10_E } #[inline] #[must_use] pub fn pow_f64x2(self, y: Self) -> Self { const_f64_as_f64x2!(ln2d_hi, 0.693145751953125); const_f64_as_f64x2!(ln2d_lo, 1.42860682030941723212E-6); const_f64_as_f64x2!(P0log, 2.0039553499201281259648E1); const_f64_as_f64x2!(P1log, 5.7112963590585538103336E1); const_f64_as_f64x2!(P2log, 6.0949667980987787057556E1); const_f64_as_f64x2!(P3log, 2.9911919328553073277375E1); const_f64_as_f64x2!(P4log, 6.5787325942061044846969E0); const_f64_as_f64x2!(P5log, 4.9854102823193375972212E-1); const_f64_as_f64x2!(P6log, 4.5270000862445199635215E-5); const_f64_as_f64x2!(Q0log, 6.0118660497603843919306E1); const_f64_as_f64x2!(Q1log, 2.1642788614495947685003E2); const_f64_as_f64x2!(Q2log, 3.0909872225312059774938E2); const_f64_as_f64x2!(Q3log, 2.2176239823732856465394E2); const_f64_as_f64x2!(Q4log, 8.3047565967967209469434E1); const_f64_as_f64x2!(Q5log, 1.5062909083469192043167E1); // Taylor expansion constants const_f64_as_f64x2!(p2, 1.0 / 2.0); // coefficients for Taylor expansion of exp const_f64_as_f64x2!(p3, 1.0 / 6.0); const_f64_as_f64x2!(p4, 1.0 / 24.0); const_f64_as_f64x2!(p5, 1.0 / 120.0); const_f64_as_f64x2!(p6, 1.0 / 720.0); const_f64_as_f64x2!(p7, 1.0 / 5040.0); const_f64_as_f64x2!(p8, 1.0 / 40320.0); const_f64_as_f64x2!(p9, 1.0 / 362880.0); const_f64_as_f64x2!(p10, 1.0 / 3628800.0); const_f64_as_f64x2!(p11, 1.0 / 39916800.0); const_f64_as_f64x2!(p12, 1.0 / 479001600.0); const_f64_as_f64x2!(p13, 1.0 / 6227020800.0); let x1 = self.abs(); let x = x1.fraction_2(); let mask = x.cmp_gt(f64x2::SQRT_2 * f64x2::HALF); let x = (!mask).blend(x + x, x); let x = x - f64x2::ONE; let x2 = x * x; let px = polynomial_6!(x, P0log, P1log, P2log, P3log, P4log, P5log, P6log); let px = px * x * x2; let qx = polynomial_6n!(x, Q0log, Q1log, Q2log, Q3log, Q4log, Q5log); let lg1 = px / qx; let ef = x1.exponent(); let ef = mask.blend(ef + f64x2::ONE, ef); let e1 = (ef * y).round(); let yr = ef.mul_sub(y, e1); let lg = f64x2::HALF.mul_neg_add(x2, x) + lg1; let x2err = (f64x2::HALF * x).mul_sub(x, f64x2::HALF * x2); let lg_err = f64x2::HALF.mul_add(x2, lg - x) - lg1; let e2 = (lg * y * f64x2::LOG2_E).round(); let v = lg.mul_sub(y, e2 * ln2d_hi); let v = e2.mul_neg_add(ln2d_lo, v); let v = v - (lg_err + x2err).mul_sub(y, yr * f64x2::LN_2); let x = v; let e3 = (x * f64x2::LOG2_E).round(); let x = e3.mul_neg_add(f64x2::LN_2, x); let z = polynomial_13m!(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) + f64x2::ONE; let ee = e1 + e2 + e3; let ei = cast::<_, i64x2>(ee.round_int()); let ej = cast::<_, i64x2>(ei + (cast::<_, i64x2>(z) >> 52)); let overflow = cast::<_, f64x2>(!ej.cmp_lt(i64x2::splat(0x07FF))) | ee.cmp_gt(f64x2::splat(3000.0)); let underflow = cast::<_, f64x2>(!ej.cmp_gt(i64x2::splat(0x000))) | ee.cmp_lt(f64x2::splat(-3000.0)); // Add exponent by integer addition let z = cast::<_, f64x2>(cast::<_, i64x2>(z) + (ei << 52)); // Check for overflow/underflow let z = if (overflow | underflow).any() { let z = underflow.blend(f64x2::ZERO, z); overflow.blend(Self::infinity(), z) } else { z }; // Check for self == 0 let x_zero = self.is_zero_or_subnormal(); let z = x_zero.blend( y.cmp_lt(f64x2::ZERO).blend( Self::infinity(), 
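// Zero (and subnormal) bases are patched here to match scalar `f64::powf`: a negative
// exponent yields +infinity, a zero exponent yields 1.0, and a positive exponent
// yields 0.0. Illustrative only, values assumed:
//   let p = f64x2::splat(0.0).powf(-2.0);
//   // every lane is f64::INFINITY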
y.cmp_eq(f64x2::ZERO).blend(f64x2::ONE, f64x2::ZERO), ), z, ); let x_sign = self.sign_bit(); let z = if x_sign.any() { // Y into an integer let yi = y.cmp_eq(y.round()); // Is y odd? let y_odd = cast::<_, i64x2>(y.round_int() << 63).round_float(); let z1 = yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow())); x_sign.blend(z1, z) } else { z }; let x_finite = self.is_finite(); let y_finite = y.is_finite(); let e_finite = ee.is_finite(); if (x_finite & y_finite & (e_finite | x_zero)).all() { return z; } (self.is_nan() | y.is_nan()).blend(self + y, z) } #[inline] pub fn powf(self, y: f64) -> Self { Self::pow_f64x2(self, f64x2::splat(y)) } #[inline] pub fn to_array(self) -> [f64; 2] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[f64; 2] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [f64; 2] { cast_mut(self) } /// Converts the lower two `i32` lanes to two `f64` lanes (and dropping the /// higher two `i32` lanes) #[inline] pub fn from_i32x4_lower2(v: i32x4) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: convert_to_m128d_from_lower2_i32_m128i(v.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: f64x2_convert_low_i32x4(v.simd)} } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { Self { neon: unsafe { vcvtq_f64_s64(vmovl_s32(vget_low_s32(v.neon))) }} } else { Self { arr: [ v.as_array_ref()[0] as f64, v.as_array_ref()[1] as f64, ]} } } } } impl From for f64x2 { /// Converts the lower two `i32` lanes to two `f64` lanes (and dropping the /// higher two `i32` lanes) #[inline] fn from(v: i32x4) -> Self { Self::from_i32x4_lower2(v) } } impl Not for f64x2 { type Output = Self; #[inline] fn not(self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: self.sse.not() } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_not(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(self.neon))) }} } else { Self { arr: [ f64::from_bits(!self.arr[0].to_bits()), f64::from_bits(!self.arr[1].to_bits()), ]} } } } } wide-0.7.32/src/f64x4_.rs000066400000000000000000001234631473735473700147370ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="avx")] { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] pub struct f64x4 { pub(crate) avx: m256d } } else { #[derive(Default, Clone, Copy, PartialEq)] #[repr(C, align(32))] pub struct f64x4 { pub(crate) a: f64x2, pub(crate) b: f64x2 } } } macro_rules! 
const_f64_as_f64x4 { ($i:ident, $f:expr) => { #[allow(non_upper_case_globals)] pub const $i: f64x4 = f64x4::new([$f; 4]); }; } impl f64x4 { const_f64_as_f64x4!(ONE, 1.0); const_f64_as_f64x4!(ZERO, 0.0); const_f64_as_f64x4!(HALF, 0.5); const_f64_as_f64x4!(E, core::f64::consts::E); const_f64_as_f64x4!(FRAC_1_PI, core::f64::consts::FRAC_1_PI); const_f64_as_f64x4!(FRAC_2_PI, core::f64::consts::FRAC_2_PI); const_f64_as_f64x4!(FRAC_2_SQRT_PI, core::f64::consts::FRAC_2_SQRT_PI); const_f64_as_f64x4!(FRAC_1_SQRT_2, core::f64::consts::FRAC_1_SQRT_2); const_f64_as_f64x4!(FRAC_PI_2, core::f64::consts::FRAC_PI_2); const_f64_as_f64x4!(FRAC_PI_3, core::f64::consts::FRAC_PI_3); const_f64_as_f64x4!(FRAC_PI_4, core::f64::consts::FRAC_PI_4); const_f64_as_f64x4!(FRAC_PI_6, core::f64::consts::FRAC_PI_6); const_f64_as_f64x4!(FRAC_PI_8, core::f64::consts::FRAC_PI_8); const_f64_as_f64x4!(LN_2, core::f64::consts::LN_2); const_f64_as_f64x4!(LN_10, core::f64::consts::LN_10); const_f64_as_f64x4!(LOG2_E, core::f64::consts::LOG2_E); const_f64_as_f64x4!(LOG10_E, core::f64::consts::LOG10_E); const_f64_as_f64x4!(LOG10_2, core::f64::consts::LOG10_2); const_f64_as_f64x4!(LOG2_10, core::f64::consts::LOG2_10); const_f64_as_f64x4!(PI, core::f64::consts::PI); const_f64_as_f64x4!(SQRT_2, core::f64::consts::SQRT_2); const_f64_as_f64x4!(TAU, core::f64::consts::TAU); } unsafe impl Zeroable for f64x4 {} unsafe impl Pod for f64x4 {} impl Add for f64x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: add_m256d(self.avx, rhs.avx) } } else { Self { a : self.a.add(rhs.a), b : self.b.add(rhs.b), } } } } } impl Sub for f64x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: sub_m256d(self.avx, rhs.avx) } } else { Self { a : self.a.sub(rhs.a), b : self.b.sub(rhs.b), } } } } } impl Mul for f64x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: mul_m256d(self.avx, rhs.avx) } } else { Self { a : self.a.mul(rhs.a), b : self.b.mul(rhs.b), } } } } } impl Div for f64x4 { type Output = Self; #[inline] #[must_use] fn div(self, rhs: Self) -> Self::Output { pick! 
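// The arithmetic operators are lane-wise, and the mixed vector/scalar forms just below
// splat the scalar before operating. A minimal sketch, values assumed and not from the
// crate's tests:
//   let v = f64x4::new([1.0, 2.0, 3.0, 4.0]);
//   let w = v * 2.0 + 1.0;
//   // w.to_array() is [3.0, 5.0, 7.0, 9.0]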
{ if #[cfg(target_feature="avx")] { Self { avx: div_m256d(self.avx, rhs.avx) } } else { Self { a : self.a.div(rhs.a), b : self.b.div(rhs.b), } } } } } impl Add for f64x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: f64) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for f64x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: f64) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for f64x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: f64) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Div for f64x4 { type Output = Self; #[inline] #[must_use] fn div(self, rhs: f64) -> Self::Output { self.div(Self::splat(rhs)) } } impl Add for f64 { type Output = f64x4; #[inline] #[must_use] fn add(self, rhs: f64x4) -> Self::Output { f64x4::splat(self).add(rhs) } } impl Sub for f64 { type Output = f64x4; #[inline] #[must_use] fn sub(self, rhs: f64x4) -> Self::Output { f64x4::splat(self).sub(rhs) } } impl Mul for f64 { type Output = f64x4; #[inline] #[must_use] fn mul(self, rhs: f64x4) -> Self::Output { f64x4::splat(self).mul(rhs) } } impl Div for f64 { type Output = f64x4; #[inline] #[must_use] fn div(self, rhs: f64x4) -> Self::Output { f64x4::splat(self).div(rhs) } } impl BitAnd for f64x4 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: bitand_m256d(self.avx, rhs.avx) } } else { Self { a : self.a.bitand(rhs.a), b : self.b.bitand(rhs.b), } } } } } impl BitOr for f64x4 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: bitor_m256d(self.avx, rhs.avx) } } else { Self { a : self.a.bitor(rhs.a), b : self.b.bitor(rhs.b), } } } } } impl BitXor for f64x4 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")] { Self { avx: bitxor_m256d(self.avx, rhs.avx) } } else { Self { a : self.a.bitxor(rhs.a), b : self.b.bitxor(rhs.b), } } } } } impl CmpEq for f64x4 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")]{ Self { avx: cmp_op_mask_m256d::<{cmp_op!(EqualOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_eq(rhs.a), b : self.b.cmp_eq(rhs.b), } } } } } impl CmpGe for f64x4 { type Output = Self; #[inline] #[must_use] fn cmp_ge(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")]{ Self { avx: cmp_op_mask_m256d::<{cmp_op!(GreaterEqualOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_ge(rhs.a), b : self.b.cmp_ge(rhs.b), } } } } } impl CmpGt for f64x4 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")]{ Self { avx: cmp_op_mask_m256d::<{cmp_op!( GreaterThanOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_gt(rhs.a), b : self.b.cmp_gt(rhs.b), } } } } } impl CmpNe for f64x4 { type Output = Self; #[inline] #[must_use] fn cmp_ne(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")]{ Self { avx: cmp_op_mask_m256d::<{cmp_op!(NotEqualOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_ne(rhs.a), b : self.b.cmp_ne(rhs.b), } } } } } impl CmpLe for f64x4 { type Output = Self; #[inline] #[must_use] fn cmp_le(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx")]{ Self { avx: cmp_op_mask_m256d::<{cmp_op!(LessEqualOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_le(rhs.a), b : self.b.cmp_le(rhs.b), } } } } } impl CmpLt for f64x4 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx")]{ Self { avx: cmp_op_mask_m256d::<{cmp_op!(LessThanOrdered)}>(self.avx, rhs.avx) } } else { Self { a : self.a.cmp_lt(rhs.a), b : self.b.cmp_lt(rhs.b), } } } } } impl f64x4 { #[inline] #[must_use] pub const fn new(array: [f64; 4]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: blend_varying_m256d(f.avx, t.avx, self.avx) } } else { Self { a : self.a.blend(t.a, f.a), b : self.b.blend(t.b, f.b), } } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="avx")] { let non_sign_bits = f64x4::from(f64::from_bits(i64::MAX as u64)); self & non_sign_bits } else { Self { a : self.a.abs(), b : self.b.abs(), } } } } #[inline] #[must_use] pub fn floor(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: floor_m256d(self.avx) } } else { Self { a : self.a.floor(), b : self.b.floor(), } } } } #[inline] #[must_use] pub fn ceil(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: ceil_m256d(self.avx) } } else { Self { a : self.a.ceil(), b : self.b.ceil(), } } } } /// Calculates the lanewise maximum of both vectors. This is a faster /// implementation than `max`, but it doesn't specify any behavior if NaNs are /// involved. #[inline] #[must_use] pub fn fast_max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: max_m256d(self.avx, rhs.avx) } } else { Self { a : self.a.fast_max(rhs.a), b : self.b.fast_max(rhs.b), } } } } /// Calculates the lanewise maximum of both vectors. If either lane is NaN, /// the other lane gets chosen. Use `fast_max` for a faster implementation /// that doesn't handle NaNs. #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { // max_m256d seems to do rhs < self ? self : rhs. So if there's any NaN // involved, it chooses rhs, so we need to specifically check rhs for // NaN. rhs.is_nan().blend(self, Self { avx: max_m256d(self.avx, rhs.avx) }) } else { Self { a : self.a.max(rhs.a), b : self.b.max(rhs.b), } } } } /// Calculates the lanewise minimum of both vectors. This is a faster /// implementation than `min`, but it doesn't specify any behavior if NaNs are /// involved. #[inline] #[must_use] pub fn fast_min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: min_m256d(self.avx, rhs.avx) } } else { Self { a : self.a.fast_min(rhs.a), b : self.b.fast_min(rhs.b), } } } } /// Calculates the lanewise minimum of both vectors. If either lane is NaN, /// the other lane gets chosen. Use `fast_min` for a faster implementation /// that doesn't handle NaNs. #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx")] { // min_m256d seems to do rhs < self ? self : rhs. So if there's any NaN // involved, it chooses rhs, so we need to specifically check rhs for // NaN. rhs.is_nan().blend(self, Self { avx: min_m256d(self.avx, rhs.avx) }) } else { Self { a : self.a.min(rhs.a), b : self.b.min(rhs.b), } } } } #[inline] #[must_use] pub fn is_nan(self) -> Self { pick! 
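// Illustrative sketch of the NaN rules documented on `max`/`min` above, values assumed:
//   let a = f64x4::splat(f64::NAN);
//   let b = f64x4::splat(1.0);
//   // `max` and `min` pick the non-NaN lane, so both results are 1.0 in every lane,
//   // while `fast_max`/`fast_min` are cheaper but leave NaN lanes unspecified.
//   let hi = a.max(b);
//   let lo = a.min(b);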
{ if #[cfg(target_feature="avx")] { Self { avx: cmp_op_mask_m256d::<{cmp_op!(Unordered)}>(self.avx, self.avx ) } } else { Self { a : self.a.is_nan(), b : self.b.is_nan(), } } } } #[inline] #[must_use] pub fn is_finite(self) -> Self { let shifted_exp_mask = u64x4::from(0xFFE0000000000000); let u: u64x4 = cast(self); let shift_u = u << 1_u64; let out = !(shift_u & shifted_exp_mask).cmp_eq(shifted_exp_mask); cast(out) } #[inline] #[must_use] pub fn is_inf(self) -> Self { let shifted_inf = u64x4::from(0xFFE0000000000000); let u: u64x4 = cast(self); let shift_u = u << 1_u64; let out = (shift_u).cmp_eq(shifted_inf); cast(out) } #[inline] #[must_use] pub fn round(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: round_m256d::<{round_op!(Nearest)}>(self.avx) } } else { Self { a : self.a.round(), b : self.b.round(), } } } } #[inline] #[must_use] pub fn round_int(self) -> i64x4 { // NOTE:No optimization for this currently available so delegate to LLVM let rounded: [f64; 4] = cast(self.round()); cast([ rounded[0] as i64, rounded[1] as i64, rounded[2] as i64, rounded[3] as i64, ]) } #[inline] #[must_use] pub fn mul_add(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="avx",target_feature="fma"))] { Self { avx: fused_mul_add_m256d(self.avx, m.avx, a.avx) } } else if #[cfg(target_feature="avx")] { // still want to use 256 bit ops (self * m) + a } else { Self { a : self.a.mul_add(m.a, a.a), b : self.b.mul_add(m.b, a.b), } } } } #[inline] #[must_use] pub fn mul_sub(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="avx",target_feature="fma"))] { Self { avx: fused_mul_sub_m256d(self.avx, m.avx, a.avx) } } else if #[cfg(target_feature="avx")] { // still want to use 256 bit ops (self * m) - a } else { Self { a : self.a.mul_sub(m.a, a.a), b : self.b.mul_sub(m.b, a.b), } } } } #[inline] #[must_use] pub fn mul_neg_add(self, m: Self, a: Self) -> Self { pick! { if #[cfg(all(target_feature="avx",target_feature="fma"))] { Self { avx: fused_mul_neg_add_m256d(self.avx, m.avx, a.avx) } } else if #[cfg(target_feature="avx")] { // still want to use 256 bit ops a - (self * m) } else { Self { a : self.a.mul_neg_add(m.a, a.a), b : self.b.mul_neg_add(m.b, a.b), } } } } #[inline] #[must_use] pub fn mul_neg_sub(self, m: Self, a: Self) -> Self { pick! 
{ if #[cfg(all(target_feature="avx",target_feature="fma"))] { Self { avx: fused_mul_neg_sub_m256d(self.avx, m.avx, a.avx) } } else if #[cfg(target_feature="avx")] { // still want to use 256 bit ops -(self * m) - a } else { Self { a : self.a.mul_neg_sub(m.a, a.a), b : self.b.mul_neg_sub(m.b, a.b), } } } } #[inline] #[must_use] pub fn flip_signs(self, signs: Self) -> Self { self ^ (signs & Self::from(-0.0)) } #[inline] #[must_use] pub fn copysign(self, sign: Self) -> Self { let magnitude_mask = Self::from(f64::from_bits(u64::MAX >> 1)); (self & magnitude_mask) | (sign & Self::from(-0.0)) } #[inline] pub fn asin_acos(self) -> (Self, Self) { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3); const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1); const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0); const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1); const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1); const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1); const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2); const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2); const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2); const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3); const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1); const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0); const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1); const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1); const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0); const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1); const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1); const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2); const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2); const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1); let xa = self.abs(); let big = xa.cmp_ge(f64x4::splat(0.625)); let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa); let x2 = x1 * x1; let x3 = x2 * x1; let x4 = x2 * x2; let x5 = x4 * x1; let do_big = big.any(); let do_small = !big.all(); let mut rx = f64x4::default(); let mut sx = f64x4::default(); let mut px = f64x4::default(); let mut qx = f64x4::default(); if do_big { rx = x3.mul_add(R3asin, x2 * R2asin) + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin)); sx = x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin)); } if do_small { px = x3.mul_add(P3asin, P0asin) + x4.mul_add(P4asin, x1 * P1asin) + x5.mul_add(P5asin, x2 * P2asin); qx = x4.mul_add(Q4asin, x5) + x3.mul_add(Q3asin, x1 * Q1asin) + x2.mul_add(Q2asin, Q0asin); }; let vx = big.blend(rx, px); let wx = big.blend(sx, qx); let y1 = vx / wx * x1; let mut z1 = f64x4::default(); let mut z2 = f64x4::default(); if do_big { let xb = (x1 + x1).sqrt(); z1 = xb.mul_add(y1, xb); } if do_small { z2 = xa.mul_add(y1, xa); } // asin let z3 = f64x4::FRAC_PI_2 - z1; let asin = big.blend(z3, z2); let asin = asin.flip_signs(self); // acos let z3 = self.cmp_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1); let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self); let acos = big.blend(z3, z4); (asin, acos) } #[inline] pub fn acos(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3); const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1); const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0); const_f64_as_f64x4!(R1asin, 
-2.556901049652824852289E1); const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1); const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1); const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2); const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2); const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2); const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3); const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1); const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0); const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1); const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1); const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0); const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1); const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1); const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2); const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2); const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1); let xa = self.abs(); let big = xa.cmp_ge(f64x4::splat(0.625)); let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa); let x2 = x1 * x1; let x3 = x2 * x1; let x4 = x2 * x2; let x5 = x4 * x1; let do_big = big.any(); let do_small = !big.all(); let mut rx = f64x4::default(); let mut sx = f64x4::default(); let mut px = f64x4::default(); let mut qx = f64x4::default(); if do_big { rx = x3.mul_add(R3asin, x2 * R2asin) + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin)); sx = x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin)); } if do_small { px = x3.mul_add(P3asin, P0asin) + x4.mul_add(P4asin, x1 * P1asin) + x5.mul_add(P5asin, x2 * P2asin); qx = x4.mul_add(Q4asin, x5) + x3.mul_add(Q3asin, x1 * Q1asin) + x2.mul_add(Q2asin, Q0asin); }; let vx = big.blend(rx, px); let wx = big.blend(sx, qx); let y1 = vx / wx * x1; let mut z1 = f64x4::default(); let mut z2 = f64x4::default(); if do_big { let xb = (x1 + x1).sqrt(); z1 = xb.mul_add(y1, xb); } if do_small { z2 = xa.mul_add(y1, xa); } // acos let z3 = self.cmp_lt(f64x4::ZERO).blend(f64x4::PI - z1, z1); let z4 = f64x4::FRAC_PI_2 - z2.flip_signs(self); let acos = big.blend(z3, z4); acos } #[inline] #[must_use] pub fn asin(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x4!(R4asin, 2.967721961301243206100E-3); const_f64_as_f64x4!(R3asin, -5.634242780008963776856E-1); const_f64_as_f64x4!(R2asin, 6.968710824104713396794E0); const_f64_as_f64x4!(R1asin, -2.556901049652824852289E1); const_f64_as_f64x4!(R0asin, 2.853665548261061424989E1); const_f64_as_f64x4!(S3asin, -2.194779531642920639778E1); const_f64_as_f64x4!(S2asin, 1.470656354026814941758E2); const_f64_as_f64x4!(S1asin, -3.838770957603691357202E2); const_f64_as_f64x4!(S0asin, 3.424398657913078477438E2); const_f64_as_f64x4!(P5asin, 4.253011369004428248960E-3); const_f64_as_f64x4!(P4asin, -6.019598008014123785661E-1); const_f64_as_f64x4!(P3asin, 5.444622390564711410273E0); const_f64_as_f64x4!(P2asin, -1.626247967210700244449E1); const_f64_as_f64x4!(P1asin, 1.956261983317594739197E1); const_f64_as_f64x4!(P0asin, -8.198089802484824371615E0); const_f64_as_f64x4!(Q4asin, -1.474091372988853791896E1); const_f64_as_f64x4!(Q3asin, 7.049610280856842141659E1); const_f64_as_f64x4!(Q2asin, -1.471791292232726029859E2); const_f64_as_f64x4!(Q1asin, 1.395105614657485689735E2); const_f64_as_f64x4!(Q0asin, -4.918853881490881290097E1); let xa = self.abs(); let big = xa.cmp_ge(f64x4::splat(0.625)); let x1 = big.blend(f64x4::splat(1.0) - xa, xa * xa); let x2 = x1 * x1; let x3 = x2 * x1; let 
x4 = x2 * x2; let x5 = x4 * x1; let do_big = big.any(); let do_small = !big.all(); let mut rx = f64x4::default(); let mut sx = f64x4::default(); let mut px = f64x4::default(); let mut qx = f64x4::default(); if do_big { rx = x3.mul_add(R3asin, x2 * R2asin) + x4.mul_add(R4asin, x1.mul_add(R1asin, R0asin)); sx = x3.mul_add(S3asin, x4) + x2.mul_add(S2asin, x1.mul_add(S1asin, S0asin)); } if do_small { px = x3.mul_add(P3asin, P0asin) + x4.mul_add(P4asin, x1 * P1asin) + x5.mul_add(P5asin, x2 * P2asin); qx = x4.mul_add(Q4asin, x5) + x3.mul_add(Q3asin, x1 * Q1asin) + x2.mul_add(Q2asin, Q0asin); }; let vx = big.blend(rx, px); let wx = big.blend(sx, qx); let y1 = vx / wx * x1; let mut z1 = f64x4::default(); let mut z2 = f64x4::default(); if do_big { let xb = (x1 + x1).sqrt(); z1 = xb.mul_add(y1, xb); } if do_small { z2 = xa.mul_add(y1, xa); } // asin let z3 = f64x4::FRAC_PI_2 - z1; let asin = big.blend(z3, z2); let asin = asin.flip_signs(self); asin } #[inline] pub fn atan(self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x4!(MORE_BITS, 6.123233995736765886130E-17); const_f64_as_f64x4!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5); const_f64_as_f64x4!(T3PO8, core::f64::consts::SQRT_2 + 1.0); const_f64_as_f64x4!(P4atan, -8.750608600031904122785E-1); const_f64_as_f64x4!(P3atan, -1.615753718733365076637E1); const_f64_as_f64x4!(P2atan, -7.500855792314704667340E1); const_f64_as_f64x4!(P1atan, -1.228866684490136173410E2); const_f64_as_f64x4!(P0atan, -6.485021904942025371773E1); const_f64_as_f64x4!(Q4atan, 2.485846490142306297962E1); const_f64_as_f64x4!(Q3atan, 1.650270098316988542046E2); const_f64_as_f64x4!(Q2atan, 4.328810604912902668951E2); const_f64_as_f64x4!(Q1atan, 4.853903996359136964868E2); const_f64_as_f64x4!(Q0atan, 1.945506571482613964425E2); let t = self.abs(); // small: t < 0.66 // medium: t <= t <= 2.4142 (1+sqrt(2)) // big: t > 2.4142 let notbig = t.cmp_le(T3PO8); let notsmal = t.cmp_ge(Self::splat(0.66)); let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2); s = notsmal & s; let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS); fac = notsmal & fac; // small: z = t / 1.0; // medium: z = (t-1.0) / (t+1.0); // big: z = -1.0 / t; let mut a = notbig & t; a = notsmal.blend(a - Self::ONE, a); let mut b = notbig & Self::ONE; b = notsmal.blend(b + t, b); let z = a / b; let zz = z * z; let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan); let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan); let mut re = (px / qx).mul_add(z * zz, z); re += s + fac; // get sign bit re = (self.sign_bit()).blend(-re, re); re } #[inline] pub fn atan2(self, x: Self) -> Self { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x4!(MORE_BITS, 6.123233995736765886130E-17); const_f64_as_f64x4!(MORE_BITS_O2, 6.123233995736765886130E-17 * 0.5); const_f64_as_f64x4!(T3PO8, core::f64::consts::SQRT_2 + 1.0); const_f64_as_f64x4!(P4atan, -8.750608600031904122785E-1); const_f64_as_f64x4!(P3atan, -1.615753718733365076637E1); const_f64_as_f64x4!(P2atan, -7.500855792314704667340E1); const_f64_as_f64x4!(P1atan, -1.228866684490136173410E2); const_f64_as_f64x4!(P0atan, -6.485021904942025371773E1); const_f64_as_f64x4!(Q4atan, 2.485846490142306297962E1); const_f64_as_f64x4!(Q3atan, 1.650270098316988542046E2); const_f64_as_f64x4!(Q2atan, 4.328810604912902668951E2); const_f64_as_f64x4!(Q1atan, 
4.853903996359136964868E2); const_f64_as_f64x4!(Q0atan, 1.945506571482613964425E2); let y = self; // move in first octant let x1 = x.abs(); let y1 = y.abs(); let swapxy = y1.cmp_gt(x1); // swap x and y if y1 > x1 let mut x2 = swapxy.blend(y1, x1); let mut y2 = swapxy.blend(x1, y1); // check for special case: x and y are both +/- INF let both_infinite = x.is_inf() & y.is_inf(); if both_infinite.any() { let minus_one = -Self::ONE; x2 = both_infinite.blend(x2 & minus_one, x2); y2 = both_infinite.blend(y2 & minus_one, y2); } // x = y = 0 gives NAN here let t = y2 / x2; // small: t < 0.66 // medium: t <= t <= 2.4142 (1+sqrt(2)) // big: t > 2.4142 let notbig = t.cmp_le(T3PO8); let notsmal = t.cmp_ge(Self::splat(0.66)); let mut s = notbig.blend(Self::FRAC_PI_4, Self::FRAC_PI_2); s = notsmal & s; let mut fac = notbig.blend(MORE_BITS_O2, MORE_BITS); fac = notsmal & fac; // small: z = t / 1.0; // medium: z = (t-1.0) / (t+1.0); // big: z = -1.0 / t; let mut a = notbig & t; a = notsmal.blend(a - Self::ONE, a); let mut b = notbig & Self::ONE; b = notsmal.blend(b + t, b); let z = a / b; let zz = z * z; let px = polynomial_4!(zz, P0atan, P1atan, P2atan, P3atan, P4atan); let qx = polynomial_5n!(zz, Q0atan, Q1atan, Q2atan, Q3atan, Q4atan); let mut re = (px / qx).mul_add(z * zz, z); re += s + fac; // move back in place re = swapxy.blend(Self::FRAC_PI_2 - re, re); re = ((x | y).cmp_eq(Self::ZERO)).blend(Self::ZERO, re); re = (x.sign_bit()).blend(Self::PI - re, re); // get sign bit re = (y.sign_bit()).blend(-re, re); re } #[inline] #[must_use] pub fn sin_cos(self) -> (Self, Self) { // Based on the Agner Fog "vector class library": // https://github.com/vectorclass/version2/blob/master/vectormath_trig.h const_f64_as_f64x4!(P0sin, -1.66666666666666307295E-1); const_f64_as_f64x4!(P1sin, 8.33333333332211858878E-3); const_f64_as_f64x4!(P2sin, -1.98412698295895385996E-4); const_f64_as_f64x4!(P3sin, 2.75573136213857245213E-6); const_f64_as_f64x4!(P4sin, -2.50507477628578072866E-8); const_f64_as_f64x4!(P5sin, 1.58962301576546568060E-10); const_f64_as_f64x4!(P0cos, 4.16666666666665929218E-2); const_f64_as_f64x4!(P1cos, -1.38888888888730564116E-3); const_f64_as_f64x4!(P2cos, 2.48015872888517045348E-5); const_f64_as_f64x4!(P3cos, -2.75573141792967388112E-7); const_f64_as_f64x4!(P4cos, 2.08757008419747316778E-9); const_f64_as_f64x4!(P5cos, -1.13585365213876817300E-11); const_f64_as_f64x4!(DP1, 7.853981554508209228515625E-1 * 2.); const_f64_as_f64x4!(DP2, 7.94662735614792836714E-9 * 2.); const_f64_as_f64x4!(DP3, 3.06161699786838294307E-17 * 2.); const_f64_as_f64x4!(TWO_OVER_PI, 2.0 / core::f64::consts::PI); let xa = self.abs(); let y = (xa * TWO_OVER_PI).round(); let q = y.round_int(); let x = y.mul_neg_add(DP3, y.mul_neg_add(DP2, y.mul_neg_add(DP1, xa))); let x2 = x * x; let mut s = polynomial_5!(x2, P0sin, P1sin, P2sin, P3sin, P4sin, P5sin); let mut c = polynomial_5!(x2, P0cos, P1cos, P2cos, P3cos, P4cos, P5cos); s = (x * x2).mul_add(s, x); c = (x2 * x2).mul_add(c, x2.mul_neg_add(f64x4::from(0.5), f64x4::from(1.0))); let swap = !((q & i64x4::from(1)).cmp_eq(i64x4::from(0))); let mut overflow: f64x4 = cast(q.cmp_gt(i64x4::from(0x80000000000000))); overflow &= xa.is_finite(); s = overflow.blend(f64x4::from(0.0), s); c = overflow.blend(f64x4::from(1.0), c); // calc sin let mut sin1 = cast::<_, f64x4>(swap).blend(c, s); let sign_sin: i64x4 = (q << 62) ^ cast::<_, i64x4>(self); sin1 = sin1.flip_signs(cast(sign_sin)); // calc cos let mut cos1 = cast::<_, f64x4>(swap).blend(s, c); let sign_cos: i64x4 = ((q + i64x4::from(1)) & 
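// Quadrant handling: bit 0 of `q` records whether the sine and cosine polynomials were
// swapped during range reduction, and this expression takes bit 1 of `q + 1`, shifts it
// up into the IEEE sign bit, and applies it to the cosine with the XOR just below.
// Illustrative only, values assumed:
//   let (s, c) = f64x4::splat(core::f64::consts::PI).sin_cos();
//   // every lane of `s` is approximately 0.0 and every lane of `c` is approximately -1.0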
i64x4::from(2)) << 62; cos1 ^= cast::<_, f64x4>(sign_cos); (sin1, cos1) } #[inline] #[must_use] pub fn sin(self) -> Self { let (s, _) = self.sin_cos(); s } #[inline] #[must_use] pub fn cos(self) -> Self { let (_, c) = self.sin_cos(); c } #[inline] #[must_use] pub fn tan(self) -> Self { let (s, c) = self.sin_cos(); s / c } #[inline] #[must_use] pub fn to_degrees(self) -> Self { const_f64_as_f64x4!(RAD_TO_DEG_RATIO, 180.0_f64 / core::f64::consts::PI); self * RAD_TO_DEG_RATIO } #[inline] #[must_use] pub fn to_radians(self) -> Self { const_f64_as_f64x4!(DEG_TO_RAD_RATIO, core::f64::consts::PI / 180.0_f64); self * DEG_TO_RAD_RATIO } #[inline] #[must_use] pub fn sqrt(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: sqrt_m256d(self.avx) } } else { Self { a : self.a.sqrt(), b : self.b.sqrt(), } } } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="avx")] { move_mask_m256d(self.avx) } else { (self.b.move_mask() << 2) | self.a.move_mask() } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="avx")] { move_mask_m256d(self.avx) != 0 } else { self.a.any() || self.b.any() } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="avx")] { move_mask_m256d(self.avx) == 0b1111 } else { self.a.all() && self.b.all() } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] fn vm_pow2n(self) -> Self { const_f64_as_f64x4!(pow2_52, 4503599627370496.0); const_f64_as_f64x4!(bias, 1023.0); let a = self + (bias + pow2_52); let c = cast::<_, i64x4>(a) << 52; cast::<_, f64x4>(c) } /// Calculate the exponent of a packed `f64x4` #[inline] #[must_use] pub fn exp(self) -> Self { const_f64_as_f64x4!(P2, 1.0 / 2.0); const_f64_as_f64x4!(P3, 1.0 / 6.0); const_f64_as_f64x4!(P4, 1. / 24.); const_f64_as_f64x4!(P5, 1. / 120.); const_f64_as_f64x4!(P6, 1. / 720.); const_f64_as_f64x4!(P7, 1. / 5040.); const_f64_as_f64x4!(P8, 1. / 40320.); const_f64_as_f64x4!(P9, 1. / 362880.); const_f64_as_f64x4!(P10, 1. / 3628800.); const_f64_as_f64x4!(P11, 1. / 39916800.); const_f64_as_f64x4!(P12, 1. / 479001600.); const_f64_as_f64x4!(P13, 1. 
/ 6227020800.); const_f64_as_f64x4!(LN2D_HI, 0.693145751953125); const_f64_as_f64x4!(LN2D_LO, 1.42860682030941723212E-6); let max_x = f64x4::from(708.39); let r = (self * Self::LOG2_E).round(); let x = r.mul_neg_add(LN2D_HI, self); let x = r.mul_neg_add(LN2D_LO, x); let z = polynomial_13!(x, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13); let n2 = Self::vm_pow2n(r); let z = (z + Self::ONE) * n2; // check for overflow let in_range = self.abs().cmp_lt(max_x); let in_range = in_range & self.is_finite(); in_range.blend(z, Self::ZERO) } #[inline] fn exponent(self) -> f64x4 { const_f64_as_f64x4!(pow2_52, 4503599627370496.0); const_f64_as_f64x4!(bias, 1023.0); let a = cast::<_, u64x4>(self); let b = a >> 52; let c = b | cast::<_, u64x4>(pow2_52); let d = cast::<_, f64x4>(c); let e = d - (pow2_52 + bias); e } #[inline] fn fraction_2(self) -> Self { let t1 = cast::<_, u64x4>(self); let t2 = cast::<_, u64x4>( (t1 & u64x4::from(0x000FFFFFFFFFFFFF)) | u64x4::from(0x3FE0000000000000), ); cast::<_, f64x4>(t2) } #[inline] fn is_zero_or_subnormal(self) -> Self { let t = cast::<_, i64x4>(self); let t = t & i64x4::splat(0x7FF0000000000000); i64x4::round_float(t.cmp_eq(i64x4::splat(0))) } #[inline] fn infinity() -> Self { cast::<_, f64x4>(i64x4::splat(0x7FF0000000000000)) } #[inline] fn nan_log() -> Self { cast::<_, f64x4>(i64x4::splat(0x7FF8000000000000 | 0x101 << 29)) } #[inline] fn nan_pow() -> Self { cast::<_, f64x4>(i64x4::splat(0x7FF8000000000000 | 0x101 << 29)) } #[inline] fn sign_bit(self) -> Self { let t1 = cast::<_, i64x4>(self); let t2 = t1 >> 63; !cast::<_, f64x4>(t2).cmp_eq(f64x4::ZERO) } /// horizontal add of all the elements of the vector #[inline] pub fn reduce_add(self) -> f64 { pick! { if #[cfg(target_feature="avx")] { // From https://stackoverflow.com/questions/49941645/get-sum-of-values-stored-in-m256d-with-sse-avx let lo = cast_to_m128d_from_m256d(self.avx); let hi = extract_m128d_from_m256d::<1>(self.avx); let lo = add_m128d(lo,hi); let hi64 = unpack_high_m128d(lo,lo); let sum = add_m128d_s(lo,hi64); get_f64_from_m128d_s(sum) } else { self.a.reduce_add() + self.b.reduce_add() } } } /// Natural log (ln(x)) #[inline] #[must_use] pub fn ln(self) -> Self { const_f64_as_f64x4!(HALF, 0.5); const_f64_as_f64x4!(P0, 7.70838733755885391666E0); const_f64_as_f64x4!(P1, 1.79368678507819816313E1); const_f64_as_f64x4!(P2, 1.44989225341610930846E1); const_f64_as_f64x4!(P3, 4.70579119878881725854E0); const_f64_as_f64x4!(P4, 4.97494994976747001425E-1); const_f64_as_f64x4!(P5, 1.01875663804580931796E-4); const_f64_as_f64x4!(Q0, 2.31251620126765340583E1); const_f64_as_f64x4!(Q1, 7.11544750618563894466E1); const_f64_as_f64x4!(Q2, 8.29875266912776603211E1); const_f64_as_f64x4!(Q3, 4.52279145837532221105E1); const_f64_as_f64x4!(Q4, 1.12873587189167450590E1); const_f64_as_f64x4!(LN2F_HI, 0.693359375); const_f64_as_f64x4!(LN2F_LO, -2.12194440e-4); const_f64_as_f64x4!(VM_SQRT2, 1.414213562373095048801); const_f64_as_f64x4!(VM_SMALLEST_NORMAL, 1.17549435E-38); let x1 = self; let x = Self::fraction_2(x1); let e = Self::exponent(x1); let mask = x.cmp_gt(VM_SQRT2 * HALF); let x = (!mask).blend(x + x, x); let fe = mask.blend(e + Self::ONE, e); let x = x - Self::ONE; let px = polynomial_5!(x, P0, P1, P2, P3, P4, P5); let x2 = x * x; let px = x2 * x * px; let qx = polynomial_5n!(x, Q0, Q1, Q2, Q3, Q4); let res = px / qx; let res = fe.mul_add(LN2F_LO, res); let res = res + x2.mul_neg_add(HALF, x); let res = fe.mul_add(LN2F_HI, res); let overflow = !self.is_finite(); let underflow = x1.cmp_lt(VM_SMALLEST_NORMAL); 
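// The `exp` above reduces the argument by a multiple of ln(2) split into high and low
// parts, evaluates the `polynomial_13!` expansion, and rescales with `vm_pow2n`; lanes
// whose input is non-finite or has magnitude of at least 708.39 come back as 0.0.
// A minimal sketch, values assumed and not from the crate's tests:
//   let e = f64x4::splat(1.0).exp();
//   // every lane is approximately core::f64::consts::E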
let mask = overflow | underflow; if !mask.any() { res } else { let is_zero = self.is_zero_or_subnormal(); let res = underflow.blend(Self::nan_log(), res); let res = is_zero.blend(Self::infinity(), res); let res = overflow.blend(self, res); res } } #[inline] #[must_use] pub fn log2(self) -> Self { Self::ln(self) * Self::LOG2_E } #[inline] #[must_use] pub fn log10(self) -> Self { Self::ln(self) * Self::LOG10_E } #[inline] #[must_use] pub fn pow_f64x4(self, y: Self) -> Self { const_f64_as_f64x4!(ln2d_hi, 0.693145751953125); const_f64_as_f64x4!(ln2d_lo, 1.42860682030941723212E-6); const_f64_as_f64x4!(P0log, 2.0039553499201281259648E1); const_f64_as_f64x4!(P1log, 5.7112963590585538103336E1); const_f64_as_f64x4!(P2log, 6.0949667980987787057556E1); const_f64_as_f64x4!(P3log, 2.9911919328553073277375E1); const_f64_as_f64x4!(P4log, 6.5787325942061044846969E0); const_f64_as_f64x4!(P5log, 4.9854102823193375972212E-1); const_f64_as_f64x4!(P6log, 4.5270000862445199635215E-5); const_f64_as_f64x4!(Q0log, 6.0118660497603843919306E1); const_f64_as_f64x4!(Q1log, 2.1642788614495947685003E2); const_f64_as_f64x4!(Q2log, 3.0909872225312059774938E2); const_f64_as_f64x4!(Q3log, 2.2176239823732856465394E2); const_f64_as_f64x4!(Q4log, 8.3047565967967209469434E1); const_f64_as_f64x4!(Q5log, 1.5062909083469192043167E1); // Taylor expansion constants const_f64_as_f64x4!(p2, 1.0 / 2.0); // coefficients for Taylor expansion of exp const_f64_as_f64x4!(p3, 1.0 / 6.0); const_f64_as_f64x4!(p4, 1.0 / 24.0); const_f64_as_f64x4!(p5, 1.0 / 120.0); const_f64_as_f64x4!(p6, 1.0 / 720.0); const_f64_as_f64x4!(p7, 1.0 / 5040.0); const_f64_as_f64x4!(p8, 1.0 / 40320.0); const_f64_as_f64x4!(p9, 1.0 / 362880.0); const_f64_as_f64x4!(p10, 1.0 / 3628800.0); const_f64_as_f64x4!(p11, 1.0 / 39916800.0); const_f64_as_f64x4!(p12, 1.0 / 479001600.0); const_f64_as_f64x4!(p13, 1.0 / 6227020800.0); let x1 = self.abs(); let x = x1.fraction_2(); let mask = x.cmp_gt(f64x4::SQRT_2 * f64x4::HALF); let x = (!mask).blend(x + x, x); let x = x - f64x4::ONE; let x2 = x * x; let px = polynomial_6!(x, P0log, P1log, P2log, P3log, P4log, P5log, P6log); let px = px * x * x2; let qx = polynomial_6n!(x, Q0log, Q1log, Q2log, Q3log, Q4log, Q5log); let lg1 = px / qx; let ef = x1.exponent(); let ef = mask.blend(ef + f64x4::ONE, ef); let e1 = (ef * y).round(); let yr = ef.mul_sub(y, e1); let lg = f64x4::HALF.mul_neg_add(x2, x) + lg1; let x2err = (f64x4::HALF * x).mul_sub(x, f64x4::HALF * x2); let lg_err = f64x4::HALF.mul_add(x2, lg - x) - lg1; let e2 = (lg * y * f64x4::LOG2_E).round(); let v = lg.mul_sub(y, e2 * ln2d_hi); let v = e2.mul_neg_add(ln2d_lo, v); let v = v - (lg_err + x2err).mul_sub(y, yr * f64x4::LN_2); let x = v; let e3 = (x * f64x4::LOG2_E).round(); let x = e3.mul_neg_add(f64x4::LN_2, x); let z = polynomial_13m!(x, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13) + f64x4::ONE; let ee = e1 + e2 + e3; let ei = cast::<_, i64x4>(ee.round_int()); let ej = cast::<_, i64x4>(ei + (cast::<_, i64x4>(z) >> 52)); let overflow = cast::<_, f64x4>(!ej.cmp_lt(i64x4::splat(0x07FF))) | ee.cmp_gt(f64x4::splat(3000.0)); let underflow = cast::<_, f64x4>(!ej.cmp_gt(i64x4::splat(0x000))) | ee.cmp_lt(f64x4::splat(-3000.0)); // Add exponent by integer addition let z = cast::<_, f64x4>(cast::<_, i64x4>(z) + (ei << 52)); // Check for overflow/underflow let z = if (overflow | underflow).any() { let z = underflow.blend(f64x4::ZERO, z); overflow.blend(Self::infinity(), z) } else { z }; // Check for self == 0 let x_zero = self.is_zero_or_subnormal(); let z = x_zero.blend( 
y.cmp_lt(f64x4::ZERO).blend( Self::infinity(), y.cmp_eq(f64x4::ZERO).blend(f64x4::ONE, f64x4::ZERO), ), z, ); let x_sign = self.sign_bit(); let z = if x_sign.any() { // Y into an integer let yi = y.cmp_eq(y.round()); // Is y odd? let y_odd = cast::<_, i64x4>(y.round_int() << 63).round_float(); let z1 = yi.blend(z | y_odd, self.cmp_eq(Self::ZERO).blend(z, Self::nan_pow())); x_sign.blend(z1, z) } else { z }; let x_finite = self.is_finite(); let y_finite = y.is_finite(); let e_finite = ee.is_finite(); if (x_finite & y_finite & (e_finite | x_zero)).all() { return z; } (self.is_nan() | y.is_nan()).blend(self + y, z) } #[inline] pub fn powf(self, y: f64) -> Self { Self::pow_f64x4(self, f64x4::splat(y)) } #[inline] pub fn to_array(self) -> [f64; 4] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[f64; 4] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [f64; 4] { cast_mut(self) } #[inline] pub fn from_i32x4(v: i32x4) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: convert_to_m256d_from_i32_m128i(v.sse) } } else { Self::new([ v.as_array_ref()[0] as f64, v.as_array_ref()[1] as f64, v.as_array_ref()[2] as f64, v.as_array_ref()[3] as f64, ]) } } } } impl From for f64x4 { #[inline] fn from(v: i32x4) -> Self { Self::from_i32x4(v) } } impl Not for f64x4 { type Output = Self; #[inline] fn not(self) -> Self { pick! { if #[cfg(target_feature="avx")] { Self { avx: self.avx.not() } } else { Self { a : self.a.not(), b : self.b.not(), } } } } } wide-0.7.32/src/i16x16_.rs000066400000000000000000000325631473735473700150220ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct i16x16 { pub(crate) avx2: m256i } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct i16x16 { pub(crate) a : i16x8, pub(crate) b : i16x8 } } } int_uint_consts!(i16, 16, i16x16, 256); unsafe impl Zeroable for i16x16 {} unsafe impl Pod for i16x16 {} impl Add for i16x16 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: add_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.add(rhs.a), b : self.b.add(rhs.b), } } } } } impl Sub for i16x16 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: sub_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.sub(rhs.a), b : self.b.sub(rhs.b), } } } } } impl Mul for i16x16 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! 
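// Lane-wise multiply keeping only the low 16 bits of each product, i.e. a wrapping
// multiply: `mul_i16_keep_low_m256i` on AVX2, delegated per half otherwise.
// Illustrative sketch, values assumed:
//   let v = i16x16::splat(300);
//   let p = v * v;
//   // 300 * 300 = 90_000, which wraps to 24_464 in every lane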
{ if #[cfg(target_feature="avx2")] { Self { avx2: mul_i16_keep_low_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.mul(rhs.a), b : self.b.mul(rhs.b), } } } } } impl Add for i16x16 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: i16) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for i16x16 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: i16) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for i16x16 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: i16) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for i16 { type Output = i16x16; #[inline] #[must_use] fn add(self, rhs: i16x16) -> Self::Output { i16x16::splat(self).add(rhs) } } impl Sub for i16 { type Output = i16x16; #[inline] #[must_use] fn sub(self, rhs: i16x16) -> Self::Output { i16x16::splat(self).sub(rhs) } } impl Mul for i16 { type Output = i16x16; #[inline] #[must_use] fn mul(self, rhs: i16x16) -> Self::Output { i16x16::splat(self).mul(rhs) } } impl BitAnd for i16x16 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitand_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitand(rhs.a), b : self.b.bitand(rhs.b), } } } } } impl BitOr for i16x16 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitor(rhs.a), b : self.b.bitor(rhs.b), } } } } } impl BitXor for i16x16 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitxor(rhs.a), b : self.b.bitxor(rhs.b), } } } } } macro_rules! impl_shl_t_for_i16x16 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for i16x16 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shl_all_u16_m256i(self.avx2, shift) } } else { Self { a : self.a.shl(rhs), b : self.b.shl(rhs), } } } } })+ }; } impl_shl_t_for_i16x16!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_i16x16 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for i16x16 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shr_all_i16_m256i(self.avx2, shift) } } else { Self { a : self.a.shr(rhs), b : self.b.shr(rhs), } } } } })+ }; } impl_shr_t_for_i16x16!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); impl CmpEq for i16x16 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: cmp_eq_mask_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.cmp_eq(rhs.a), b : self.b.cmp_eq(rhs.b), } } } } } impl CmpGt for i16x16 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { Self { avx2: cmp_gt_mask_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.cmp_gt(rhs.a), b : self.b.cmp_gt(rhs.b), } } } } } impl CmpLt for i16x16 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: !cmp_gt_mask_i16_m256i(self.avx2, rhs.avx2) ^ cmp_eq_mask_i16_m256i(self.avx2,rhs.avx2) } } else { Self { a : self.a.cmp_lt(rhs.a), b : self.b.cmp_lt(rhs.b), } } } } } impl From for i16x16 { /// widen with sign extend from i8 to i16 #[inline] #[must_use] fn from(i: i8x16) -> Self { i16x16::from_i8x16(i) } } impl From for i16x16 { /// widen with zero extend from u8 to i16 #[inline] #[must_use] fn from(i: u8x16) -> Self { cast(u16x16::from(i)) } } impl i16x16 { #[inline] #[must_use] pub const fn new(array: [i16; 16]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="sse2")] { let [a,b] = cast::<_,[m128i;2]>(self); move_mask_i8_m128i( pack_i16_to_i8_m128i(a,b)) } else { self.a.move_mask() | (self.b.move_mask() << 8) } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { ((move_mask_i8_m256i(self.avx2) as u32) & 0b10101010101010101010101010101010) != 0 } else { (self.a | self.b).any() } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { ((move_mask_i8_m256i(self.avx2) as u32) & 0b10101010101010101010101010101010) == 0b10101010101010101010101010101010 } else { (self.a & self.b).all() } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } /// widens and sign extends to i16x16 #[inline] #[must_use] pub fn from_i8x16(v: i8x16) -> Self { pick! { if #[cfg(target_feature="avx2")] { i16x16 { avx2:convert_to_i16_m256i_from_i8_m128i(v.sse) } } else if #[cfg(target_feature="sse4.1")] { i16x16 { a: i16x8 { sse: convert_to_i16_m128i_from_lower8_i8_m128i(v.sse) }, b: i16x8 { sse: convert_to_i16_m128i_from_lower8_i8_m128i(unpack_high_i64_m128i(v.sse, v.sse)) } } } else if #[cfg(target_feature="sse2")] { i16x16 { a: i16x8 { sse: shr_imm_i16_m128i::<8>( unpack_low_i8_m128i(v.sse, v.sse)) }, b: i16x8 { sse: shr_imm_i16_m128i::<8>( unpack_high_i8_m128i(v.sse, v.sse)) }, } } else { i16x16::new([ v.as_array_ref()[0] as i16, v.as_array_ref()[1] as i16, v.as_array_ref()[2] as i16, v.as_array_ref()[3] as i16, v.as_array_ref()[4] as i16, v.as_array_ref()[5] as i16, v.as_array_ref()[6] as i16, v.as_array_ref()[7] as i16, v.as_array_ref()[8] as i16, v.as_array_ref()[9] as i16, v.as_array_ref()[10] as i16, v.as_array_ref()[11] as i16, v.as_array_ref()[12] as i16, v.as_array_ref()[13] as i16, v.as_array_ref()[14] as i16, v.as_array_ref()[15] as i16, ]) } } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! 
{ if #[cfg(target_feature="avx2")] { Self { avx2: blend_varying_i8_m256i(f.avx2, t.avx2, self.avx2) } } else { Self { a : self.a.blend(t.a, f.a), b : self.b.blend(t.b, f.b), } } } } /// horizontal add of all the elements of the vector #[inline] #[must_use] pub fn reduce_add(self) -> i16 { let arr: [i16x8; 2] = cast(self); (arr[0] + arr[1]).reduce_add() } /// horizontal min of all the elements of the vector #[inline] #[must_use] pub fn reduce_min(self) -> i16 { let arr: [i16x8; 2] = cast(self); arr[0].min(arr[1]).reduce_min() } /// horizontal max of all the elements of the vector #[inline] #[must_use] pub fn reduce_max(self) -> i16 { let arr: [i16x8; 2] = cast(self); arr[0].max(arr[1]).reduce_max() } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: abs_i16_m256i(self.avx2) } } else { Self { a : self.a.abs(), b : self.b.abs(), } } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: max_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.max(rhs.a), b : self.b.max(rhs.b), } } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: min_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.min(rhs.a), b : self.b.min(rhs.b), } } } } #[inline] #[must_use] pub fn saturating_add(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: add_saturating_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.saturating_add(rhs.a), b : self.b.saturating_add(rhs.b), } } } } #[inline] #[must_use] pub fn saturating_sub(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: sub_saturating_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.saturating_sub(rhs.a), b : self.b.saturating_sub(rhs.b), } } } } /// Calculates partial dot product. /// Multiplies packed signed 16-bit integers, producing intermediate signed /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit /// integers. #[inline] #[must_use] pub fn dot(self, rhs: Self) -> i32x8 { pick! { if #[cfg(target_feature="avx2")] { i32x8 { avx2: mul_i16_horizontal_add_m256i(self.avx2, rhs.avx2) } } else { i32x8 { a : self.a.dot(rhs.a), b : self.b.dot(rhs.b), } } } } /// Multiply and scale equivalent to `((self * rhs) + 0x4000) >> 15` on each /// lane, effectively multiplying by a 16 bit fixed point number between `-1` /// and `1`. This corresponds to the following instructions: /// - `vqrdmulhq_n_s16` instruction on neon /// - `i16x8_q15mulr_sat` on simd128 /// - `_mm256_mulhrs_epi16` on avx2 /// - emulated via `mul_i16_*` on sse2 #[inline] #[must_use] pub fn mul_scale_round(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: mul_i16_scale_round_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.mul_scale_round(rhs.a), b : self.b.mul_scale_round(rhs.b), } } } } /// Multiply and scale equivalent to `((self * rhs) + 0x4000) >> 15` on each /// lane, effectively multiplying by a 16 bit fixed point number between `-1` /// and `1`. This corresponds to the following instructions: /// - `vqrdmulhq_n_s16` instruction on neon /// - `i16x8_q15mulr_sat` on simd128 /// - `_mm256_mulhrs_epi16` on avx2 /// - emulated via `mul_i16_*` on sse2 #[inline] #[must_use] pub fn mul_scale_round_n(self, rhs: i16) -> Self { pick! 
{ if #[cfg(target_feature="avx2")] { Self { avx2: mul_i16_scale_round_m256i(self.avx2, set_splat_i16_m256i(rhs)) } } else { Self { a : self.a.mul_scale_round_n(rhs), b : self.b.mul_scale_round_n(rhs), } } } } #[inline] pub fn to_array(self) -> [i16; 16] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[i16; 16] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [i16; 16] { cast_mut(self) } } wide-0.7.32/src/i16x8_.rs000066400000000000000000001261651473735473700147450ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct i16x8 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] pub struct i16x8 { pub(crate) simd: v128 } impl Default for i16x8 { fn default() -> Self { Self::splat(0) } } impl PartialEq for i16x8 { fn eq(&self, other: &Self) -> bool { u16x8_all_true(i16x8_eq(self.simd, other.simd)) } } impl Eq for i16x8 { } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ use core::arch::aarch64::*; #[repr(C)] #[derive(Copy, Clone)] pub struct i16x8 { pub(crate) neon : int16x8_t } impl Default for i16x8 { #[inline] #[must_use] fn default() -> Self { Self::splat(0) } } impl PartialEq for i16x8 { #[inline] #[must_use] fn eq(&self, other: &Self) -> bool { unsafe { vminvq_u16(vceqq_s16(self.neon, other.neon))==u16::MAX } } } impl Eq for i16x8 { } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct i16x8 { pub(crate) arr: [i16;8] } } } int_uint_consts!(i16, 8, i16x8, 128); unsafe impl Zeroable for i16x8 {} unsafe impl Pod for i16x8 {} impl Add for i16x8 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: add_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_s16(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_add(rhs.arr[0]), self.arr[1].wrapping_add(rhs.arr[1]), self.arr[2].wrapping_add(rhs.arr[2]), self.arr[3].wrapping_add(rhs.arr[3]), self.arr[4].wrapping_add(rhs.arr[4]), self.arr[5].wrapping_add(rhs.arr[5]), self.arr[6].wrapping_add(rhs.arr[6]), self.arr[7].wrapping_add(rhs.arr[7]), ]} } } } } impl Sub for i16x8 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vsubq_s16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].wrapping_sub(rhs.arr[0]), self.arr[1].wrapping_sub(rhs.arr[1]), self.arr[2].wrapping_sub(rhs.arr[2]), self.arr[3].wrapping_sub(rhs.arr[3]), self.arr[4].wrapping_sub(rhs.arr[4]), self.arr[5].wrapping_sub(rhs.arr[5]), self.arr[6].wrapping_sub(rhs.arr[6]), self.arr[7].wrapping_sub(rhs.arr[7]), ]} } } } } impl Mul for i16x8 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: mul_i16_keep_low_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_mul(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmulq_s16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].wrapping_mul(rhs.arr[0]), self.arr[1].wrapping_mul(rhs.arr[1]), self.arr[2].wrapping_mul(rhs.arr[2]), self.arr[3].wrapping_mul(rhs.arr[3]), self.arr[4].wrapping_mul(rhs.arr[4]), self.arr[5].wrapping_mul(rhs.arr[5]), self.arr[6].wrapping_mul(rhs.arr[6]), self.arr[7].wrapping_mul(rhs.arr[7]), ]} } } } } impl Add for i16x8 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: i16) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for i16x8 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: i16) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for i16x8 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: i16) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for i16 { type Output = i16x8; #[inline] #[must_use] fn add(self, rhs: i16x8) -> Self::Output { i16x8::splat(self).add(rhs) } } impl Sub for i16 { type Output = i16x8; #[inline] #[must_use] fn sub(self, rhs: i16x8) -> Self::Output { i16x8::splat(self).sub(rhs) } } impl Mul for i16 { type Output = i16x8; #[inline] #[must_use] fn mul(self, rhs: i16x8) -> Self::Output { i16x8::splat(self).mul(rhs) } } impl BitAnd for i16x8 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitand_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vandq_s16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitand(rhs.arr[0]), self.arr[1].bitand(rhs.arr[1]), self.arr[2].bitand(rhs.arr[2]), self.arr[3].bitand(rhs.arr[3]), self.arr[4].bitand(rhs.arr[4]), self.arr[5].bitand(rhs.arr[5]), self.arr[6].bitand(rhs.arr[6]), self.arr[7].bitand(rhs.arr[7]), ]} } } } } impl BitOr for i16x8 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vorrq_s16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitor(rhs.arr[0]), self.arr[1].bitor(rhs.arr[1]), self.arr[2].bitor(rhs.arr[2]), self.arr[3].bitor(rhs.arr[3]), self.arr[4].bitor(rhs.arr[4]), self.arr[5].bitor(rhs.arr[5]), self.arr[6].bitor(rhs.arr[6]), self.arr[7].bitor(rhs.arr[7]), ]} } } } } impl BitXor for i16x8 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: bitxor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: veorq_s16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitxor(rhs.arr[0]), self.arr[1].bitxor(rhs.arr[1]), self.arr[2].bitxor(rhs.arr[2]), self.arr[3].bitxor(rhs.arr[3]), self.arr[4].bitxor(rhs.arr[4]), self.arr[5].bitxor(rhs.arr[5]), self.arr[6].bitxor(rhs.arr[6]), self.arr[7].bitxor(rhs.arr[7]), ]} } } } } macro_rules! impl_shl_t_for_i16x8 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for i16x8 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shl_all_u16_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_shl(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_s16(self.neon, vmovq_n_s16(rhs as i16)) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] << u, self.arr[1] << u, self.arr[2] << u, self.arr[3] << u, self.arr[4] << u, self.arr[5] << u, self.arr[6] << u, self.arr[7] << u, ]} } } } })+ }; } impl_shl_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_i16x8 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for i16x8 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shr_all_i16_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_shr(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_s16(self.neon, vmovq_n_s16( -(rhs as i16))) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] >> u, self.arr[1] >> u, self.arr[2] >> u, self.arr[3] >> u, self.arr[4] >> u, self.arr[5] >> u, self.arr[6] >> u, self.arr[7] >> u, ]} } } } })+ }; } impl_shr_t_for_i16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); impl CmpEq for i16x8 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_eq_mask_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s16_u16(vceqq_s16(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] == rhs.arr[0] { -1 } else { 0 }, if self.arr[1] == rhs.arr[1] { -1 } else { 0 }, if self.arr[2] == rhs.arr[2] { -1 } else { 0 }, if self.arr[3] == rhs.arr[3] { -1 } else { 0 }, if self.arr[4] == rhs.arr[4] { -1 } else { 0 }, if self.arr[5] == rhs.arr[5] { -1 } else { 0 }, if self.arr[6] == rhs.arr[6] { -1 } else { 0 }, if self.arr[7] == rhs.arr[7] { -1 } else { 0 }, ]} } } } } impl CmpGt for i16x8 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: cmp_gt_mask_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_gt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s16_u16(vcgtq_s16(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] > rhs.arr[0] { -1 } else { 0 }, if self.arr[1] > rhs.arr[1] { -1 } else { 0 }, if self.arr[2] > rhs.arr[2] { -1 } else { 0 }, if self.arr[3] > rhs.arr[3] { -1 } else { 0 }, if self.arr[4] > rhs.arr[4] { -1 } else { 0 }, if self.arr[5] > rhs.arr[5] { -1 } else { 0 }, if self.arr[6] > rhs.arr[6] { -1 } else { 0 }, if self.arr[7] > rhs.arr[7] { -1 } else { 0 }, ]} } } } } impl CmpLt for i16x8 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_lt_mask_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_lt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s16_u16(vcltq_s16(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] < rhs.arr[0] { -1 } else { 0 }, if self.arr[1] < rhs.arr[1] { -1 } else { 0 }, if self.arr[2] < rhs.arr[2] { -1 } else { 0 }, if self.arr[3] < rhs.arr[3] { -1 } else { 0 }, if self.arr[4] < rhs.arr[4] { -1 } else { 0 }, if self.arr[5] < rhs.arr[5] { -1 } else { 0 }, if self.arr[6] < rhs.arr[6] { -1 } else { 0 }, if self.arr[7] < rhs.arr[7] { -1 } else { 0 }, ]} } } } } impl i16x8 { #[inline] #[must_use] pub const fn new(array: [i16; 8]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="sse2")] { move_mask_i8_m128i( pack_i16_to_i8_m128i(self.sse,self.sse)) & 0xff } else if #[cfg(target_feature="simd128")] { i16x8_bitmask(self.simd) as i32 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { // set all to 1 if top bit is set, else 0 let masked = vcltq_s16(self.neon, vdupq_n_s16(0)); // select the right bit out of each lane let selectbit : uint16x8_t = core::intrinsics::transmute([1u16, 2, 4, 8, 16, 32, 64, 128]); let r = vandq_u16(masked, selectbit); // horizontally add the 16-bit lanes vaddvq_u16(r) as i32 } } else { ((self.arr[0] < 0) as i32) << 0 | ((self.arr[1] < 0) as i32) << 1 | ((self.arr[2] < 0) as i32) << 2 | ((self.arr[3] < 0) as i32) << 3 | ((self.arr[4] < 0) as i32) << 4 | ((self.arr[5] < 0) as i32) << 5 | ((self.arr[6] < 0) as i32) << 6 | ((self.arr[7] < 0) as i32) << 7 } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="sse2")] { (move_mask_i8_m128i(self.sse) & 0b1010101010101010) != 0 } else if #[cfg(target_feature="simd128")] { u16x8_bitmask(self.simd) != 0 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe { vminvq_s16(self.neon) < 0 } } else { let v : [u64;2] = cast(self); ((v[0] | v[1]) & 0x8000800080008000) != 0 } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! 
{ if #[cfg(target_feature="sse2")] { (move_mask_i8_m128i(self.sse) & 0b1010101010101010) == 0b1010101010101010 } else if #[cfg(target_feature="simd128")] { u16x8_bitmask(self.simd) == 0b11111111 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe { vmaxvq_s16(self.neon) < 0 } } else { let v : [u64;2] = cast(self); (v[0] & v[1] & 0x8000800080008000) == 0x8000800080008000 } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } /// Unpack the lower half of the input and expand it to `i16` values. #[inline] #[must_use] pub fn from_u8x16_low(u: u8x16) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self{ sse: unpack_low_i8_m128i(u.sse, m128i::zeroed()) } } else { let u_arr: [u8; 16] = cast(u); cast([ u_arr[0] as u16 as i16, u_arr[1] as u16 as i16, u_arr[2] as u16 as i16, u_arr[3] as u16 as i16, u_arr[4] as u16 as i16, u_arr[5] as u16 as i16, u_arr[6] as u16 as i16, u_arr[7] as u16 as i16, ]) } } } /// Unpack the upper half of the input and expand it to `i16` values. #[inline] #[must_use] pub fn from_u8x16_high(u: u8x16) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self{ sse: unpack_high_i8_m128i(u.sse, m128i::zeroed()) } } else { let u_arr: [u8; 16] = cast(u); cast([ u_arr[8] as u16 as i16, u_arr[9] as u16 as i16, u_arr[10] as u16 as i16, u_arr[11] as u16 as i16, u_arr[12] as u16 as i16, u_arr[13] as u16 as i16, u_arr[14] as u16 as i16, u_arr[15] as u16 as i16, ]) } } } /// returns low `i16` of `i32`, saturating values that are too large #[inline] #[must_use] pub fn from_i32x8_saturate(v: i32x8) -> Self { pick! { if #[cfg(target_feature="avx2")] { i16x8 { sse: pack_i32_to_i16_m128i( extract_m128i_from_m256i::<0>(v.avx2), extract_m128i_from_m256i::<1>(v.avx2)) } } else if #[cfg(target_feature="sse2")] { i16x8 { sse: pack_i32_to_i16_m128i( v.a.sse, v.b.sse ) } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; i16x8 { simd: i16x8_narrow_i32x4(v.a.simd, v.b.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { use core::arch::aarch64::*; unsafe { i16x8 { neon: vcombine_s16(vqmovn_s32(v.a.neon), vqmovn_s32(v.b.neon)) } } } else { fn clamp(a : i32) -> i16 { if a < i16::MIN as i32 { i16::MIN } else if a > i16::MAX as i32 { i16::MAX } else { a as i16 } } i16x8::new([ clamp(v.as_array_ref()[0]), clamp(v.as_array_ref()[1]), clamp(v.as_array_ref()[2]), clamp(v.as_array_ref()[3]), clamp(v.as_array_ref()[4]), clamp(v.as_array_ref()[5]), clamp(v.as_array_ref()[6]), clamp(v.as_array_ref()[7]), ]) } } } /// returns low `i16` of `i32`, truncating the upper bits if they are set #[inline] #[must_use] pub fn from_i32x8_truncate(v: i32x8) -> Self { pick! { if #[cfg(target_feature="avx2")] { let a = v.avx2.bitand(set_splat_i32_m256i(0xffff)); i16x8 { sse: pack_i32_to_u16_m128i( extract_m128i_from_m256i::<0>(a), extract_m128i_from_m256i::<1>(a) ) } } else if #[cfg(target_feature="sse2")] { let a = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.a.sse)); let b = shr_imm_i32_m128i::<16>(shl_imm_u32_m128i::<16>(v.b.sse)); i16x8 { sse: pack_i32_to_i16_m128i( a, b) } } else { i16x8::new([ v.as_array_ref()[0] as i16, v.as_array_ref()[1] as i16, v.as_array_ref()[2] as i16, v.as_array_ref()[3] as i16, v.as_array_ref()[4] as i16, v.as_array_ref()[5] as i16, v.as_array_ref()[6] as i16, v.as_array_ref()[7] as i16, ]) } } } #[inline] #[must_use] pub fn from_slice_unaligned(input: &[i16]) -> Self { assert!(input.len() >= 8); pick! 
{ if #[cfg(target_feature="sse2")] { unsafe { Self { sse: load_unaligned_m128i( &*(input.as_ptr() as * const [u8;16]) ) } } } else if #[cfg(target_feature="simd128")] { unsafe { Self { simd: v128_load(input.as_ptr() as *const v128 ) } } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vld1q_s16( input.as_ptr() as *const i16 ) } } } else { // 2018 edition doesn't have try_into unsafe { Self::new( *(input.as_ptr() as * const [i16;8]) ) } } } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vbslq_s16(vreinterpretq_u16_s16(self.neon), t.neon, f.neon) }} } else { generic_bit_blend(self, t, f) } } } #[inline] #[must_use] pub fn is_negative(self) -> Self { self.cmp_lt(Self::zeroed()) } /// horizontal add of all the elements of the vector #[inline] #[must_use] pub fn reduce_add(self) -> i16 { pick! { if #[cfg(target_feature="sse2")] { // there is a horizontal add instruction on ssse3, but apparently it is very slow on some AMD CPUs let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse); let sum64 = add_i16_m128i(self.sse, hi64); let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64); let sum32 = add_i16_m128i(sum64, hi32); let lo16 = shr_imm_u32_m128i::<16>(sum32); let sum16 = add_i16_m128i(sum32, lo16); extract_i16_as_i32_m128i::<0>(sum16) as i16 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { vaddvq_s16(self.neon) } } else { let arr: [i16; 8] = cast(self); // most boring implementation possible so optimizer doesn't overthink this let mut r = arr[0]; r = r.wrapping_add(arr[1]); r = r.wrapping_add(arr[2]); r = r.wrapping_add(arr[3]); r = r.wrapping_add(arr[4]); r = r.wrapping_add(arr[5]); r = r.wrapping_add(arr[6]); r.wrapping_add(arr[7]) } } } /// horizontal min of all the elements of the vector #[inline] #[must_use] pub fn reduce_min(self) -> i16 { pick! { if #[cfg(target_feature="sse2")] { let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse); let sum64 = min_i16_m128i(self.sse, hi64); let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64); let sum32 = min_i16_m128i(sum64, hi32); let lo16 = shr_imm_u32_m128i::<16>(sum32); let sum16 = min_i16_m128i(sum32, lo16); extract_i16_as_i32_m128i::<0>(sum16) as i16 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { vminvq_s16(self.neon) } } else { let arr: [i16; 8] = cast(self); // most boring implementation possible so optimizer doesn't overthink this let mut r = arr[0]; r = r.min(arr[1]); r = r.min(arr[2]); r = r.min(arr[3]); r = r.min(arr[4]); r = r.min(arr[5]); r = r.min(arr[6]); r.min(arr[7]) } } } /// horizontal max of all the elements of the vector #[inline] #[must_use] pub fn reduce_max(self) -> i16 { pick! 
{ if #[cfg(target_feature="sse2")] { let hi64 = shuffle_ai_f32_all_m128i::<0b01_00_11_10>(self.sse); let sum64 = max_i16_m128i(self.sse, hi64); let hi32 = shuffle_ai_f32_all_m128i::<0b11_10_00_01>(sum64); let sum32 = max_i16_m128i(sum64, hi32); let lo16 = shr_imm_u32_m128i::<16>(sum32); let sum16 = max_i16_m128i(sum32, lo16); extract_i16_as_i32_m128i::<0>(sum16) as i16 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { vmaxvq_s16(self.neon) } } else { let arr: [i16; 8] = cast(self); // most boring implementation possible so optimizer doesn't overthink this let mut r = arr[0]; r = r.max(arr[1]); r = r.max(arr[2]); r = r.max(arr[3]); r = r.max(arr[4]); r = r.max(arr[5]); r = r.max(arr[6]); r.max(arr[7]) } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="sse2")] { let mask = shr_imm_i16_m128i::<15>(self.sse); Self { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) } } else if #[cfg(target_feature="ssse3")] { Self { sse: abs_i16_m128i(self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vabsq_s16(self.neon) }} } else { let arr: [i16; 8] = cast(self); cast( [ arr[0].wrapping_abs(), arr[1].wrapping_abs(), arr[2].wrapping_abs(), arr[3].wrapping_abs(), arr[4].wrapping_abs(), arr[5].wrapping_abs(), arr[6].wrapping_abs(), arr[7].wrapping_abs(), ]) } } } #[inline] #[must_use] pub fn unsigned_abs(self) -> u16x8 { pick! { if #[cfg(target_feature="sse2")] { let mask = shr_imm_i16_m128i::<15>(self.sse); u16x8 { sse: bitxor_m128i(add_i16_m128i(self.sse, mask), mask) } } else if #[cfg(target_feature="ssse3")] { u16x8 { sse: abs_i16_m128i(self.sse) } } else if #[cfg(target_feature="simd128")] { u16x8 { simd: i16x8_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {u16x8 { neon: vreinterpretq_u16_s16(vabsq_s16(self.neon)) }} } else { let arr: [i16; 8] = cast(self); cast( [ arr[0].unsigned_abs(), arr[1].unsigned_abs(), arr[2].unsigned_abs(), arr[3].unsigned_abs(), arr[4].unsigned_abs(), arr[5].unsigned_abs(), arr[6].unsigned_abs(), arr[7].unsigned_abs(), ]) } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: max_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_max(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxq_s16(self.neon, rhs.neon) }} } else { self.cmp_lt(rhs).blend(rhs, self) } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: min_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_min(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminq_s16(self.neon, rhs.neon) }} } else { self.cmp_lt(rhs).blend(self, rhs) } } } #[inline] #[must_use] pub fn saturating_add(self, rhs: Self) -> Self { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: add_saturating_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_add_sat(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vqaddq_s16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].saturating_add(rhs.arr[0]), self.arr[1].saturating_add(rhs.arr[1]), self.arr[2].saturating_add(rhs.arr[2]), self.arr[3].saturating_add(rhs.arr[3]), self.arr[4].saturating_add(rhs.arr[4]), self.arr[5].saturating_add(rhs.arr[5]), self.arr[6].saturating_add(rhs.arr[6]), self.arr[7].saturating_add(rhs.arr[7]), ]} } } } #[inline] #[must_use] pub fn saturating_sub(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_saturating_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_sub_sat(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vqsubq_s16(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].saturating_sub(rhs.arr[0]), self.arr[1].saturating_sub(rhs.arr[1]), self.arr[2].saturating_sub(rhs.arr[2]), self.arr[3].saturating_sub(rhs.arr[3]), self.arr[4].saturating_sub(rhs.arr[4]), self.arr[5].saturating_sub(rhs.arr[5]), self.arr[6].saturating_sub(rhs.arr[6]), self.arr[7].saturating_sub(rhs.arr[7]), ]} } } } /// Calculates partial dot product. /// Multiplies packed signed 16-bit integers, producing intermediate signed /// 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit /// integers. #[inline] #[must_use] pub fn dot(self, rhs: Self) -> i32x4 { pick! { if #[cfg(target_feature="sse2")] { i32x4 { sse: mul_i16_horizontal_add_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { i32x4 { simd: i32x4_dot_i16x8(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { let pl = vmull_s16(vget_low_s16(self.neon), vget_low_s16(rhs.neon)); let ph = vmull_high_s16(self.neon, rhs.neon); i32x4 { neon: vpaddq_s32(pl, ph) } } } else { i32x4 { arr: [ (i32::from(self.arr[0]) * i32::from(rhs.arr[0])) + (i32::from(self.arr[1]) * i32::from(rhs.arr[1])), (i32::from(self.arr[2]) * i32::from(rhs.arr[2])) + (i32::from(self.arr[3]) * i32::from(rhs.arr[3])), (i32::from(self.arr[4]) * i32::from(rhs.arr[4])) + (i32::from(self.arr[5]) * i32::from(rhs.arr[5])), (i32::from(self.arr[6]) * i32::from(rhs.arr[6])) + (i32::from(self.arr[7]) * i32::from(rhs.arr[7])), ] } } } } /// Multiply and scale equivalent to `((self * rhs) + 0x4000) >> 15` on each /// lane, effectively multiplying by a 16 bit fixed point number between `-1` /// and `1`. This corresponds to the following instructions: /// - `vqrdmulhq_s16` instruction on neon /// - `i16x8_q15mulr_sat` on simd128 /// - `_mm_mulhrs_epi16` on ssse3 /// - emulated via `mul_i16_*` on sse2 #[inline] #[must_use] pub fn mul_scale_round(self, rhs: Self) -> Self { pick! 
{ if #[cfg(target_feature="ssse3")] { Self { sse: mul_i16_scale_round_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="sse2")] { // unfortunately mul_i16_scale_round_m128i only got added in sse3 let hi = mul_i16_keep_high_m128i(self.sse, rhs.sse); let lo = mul_i16_keep_low_m128i(self.sse, rhs.sse); let mut v1 = unpack_low_i16_m128i(lo, hi); let mut v2 = unpack_high_i16_m128i(lo, hi); let a = set_splat_i32_m128i(0x4000); v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a)); v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a)); let s = pack_i32_to_i16_m128i(v1, v2); Self { sse: s } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_q15mulr_sat(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vqrdmulhq_s16(self.neon, rhs.neon) } } } else { // compiler does a surprisingly good job of vectorizing this Self { arr: [ ((i32::from(self.arr[0]) * i32::from(rhs.arr[0]) + 0x4000) >> 15) as i16, ((i32::from(self.arr[1]) * i32::from(rhs.arr[1]) + 0x4000) >> 15) as i16, ((i32::from(self.arr[2]) * i32::from(rhs.arr[2]) + 0x4000) >> 15) as i16, ((i32::from(self.arr[3]) * i32::from(rhs.arr[3]) + 0x4000) >> 15) as i16, ((i32::from(self.arr[4]) * i32::from(rhs.arr[4]) + 0x4000) >> 15) as i16, ((i32::from(self.arr[5]) * i32::from(rhs.arr[5]) + 0x4000) >> 15) as i16, ((i32::from(self.arr[6]) * i32::from(rhs.arr[6]) + 0x4000) >> 15) as i16, ((i32::from(self.arr[7]) * i32::from(rhs.arr[7]) + 0x4000) >> 15) as i16, ]} } } } /// Multiples two `i16x8` and return the high part of intermediate `i32x8` #[inline] #[must_use] pub fn mul_keep_high(lhs: Self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: mul_i16_keep_high_m128i(lhs.sse, rhs.sse) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { let lhs_low = unsafe { vget_low_s16(lhs.neon) }; let rhs_low = unsafe { vget_low_s16(rhs.neon) }; let lhs_high = unsafe { vget_high_s16(lhs.neon) }; let rhs_high = unsafe { vget_high_s16(rhs.neon) }; let low = unsafe { vmull_s16(lhs_low, rhs_low) }; let high = unsafe { vmull_s16(lhs_high, rhs_high) }; i16x8 { neon: unsafe { vreinterpretq_s16_u16(vuzpq_u16(vreinterpretq_u16_s32(low), vreinterpretq_u16_s32(high)).1) } } } else if #[cfg(target_feature="simd128")] { let low = i32x4_extmul_low_i16x8(lhs.simd, rhs.simd); let high = i32x4_extmul_high_i16x8(lhs.simd, rhs.simd); Self { simd: i16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(low, high) } } else { i16x8::new([ ((i32::from(rhs.as_array_ref()[0]) * i32::from(lhs.as_array_ref()[0])) >> 16) as i16, ((i32::from(rhs.as_array_ref()[1]) * i32::from(lhs.as_array_ref()[1])) >> 16) as i16, ((i32::from(rhs.as_array_ref()[2]) * i32::from(lhs.as_array_ref()[2])) >> 16) as i16, ((i32::from(rhs.as_array_ref()[3]) * i32::from(lhs.as_array_ref()[3])) >> 16) as i16, ((i32::from(rhs.as_array_ref()[4]) * i32::from(lhs.as_array_ref()[4])) >> 16) as i16, ((i32::from(rhs.as_array_ref()[5]) * i32::from(lhs.as_array_ref()[5])) >> 16) as i16, ((i32::from(rhs.as_array_ref()[6]) * i32::from(lhs.as_array_ref()[6])) >> 16) as i16, ((i32::from(rhs.as_array_ref()[7]) * i32::from(lhs.as_array_ref()[7])) >> 16) as i16, ]) } } } /// multiplies two `i16x8` and returns the result as a widened `i32x8` #[inline] #[must_use] pub fn mul_widen(self, rhs: Self) -> i32x8 { pick! 
{ if #[cfg(target_feature="avx2")] { let a = convert_to_i32_m256i_from_i16_m128i(self.sse); let b = convert_to_i32_m256i_from_i16_m128i(rhs.sse); i32x8 { avx2: mul_i32_keep_low_m256i(a,b) } } else if #[cfg(target_feature="sse2")] { let low = mul_i16_keep_low_m128i(self.sse, rhs.sse); let high = mul_i16_keep_high_m128i(self.sse, rhs.sse); i32x8 { a: i32x4 { sse:unpack_low_i16_m128i(low, high) }, b: i32x4 { sse:unpack_high_i16_m128i(low, high) } } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { let lhs_low = unsafe { vget_low_s16(self.neon) }; let rhs_low = unsafe { vget_low_s16(rhs.neon) }; let lhs_high = unsafe { vget_high_s16(self.neon) }; let rhs_high = unsafe { vget_high_s16(rhs.neon) }; let low = unsafe { vmull_s16(lhs_low, rhs_low) }; let high = unsafe { vmull_s16(lhs_high, rhs_high) }; i32x8 { a: i32x4 { neon: low }, b: i32x4 {neon: high } } } else { let a = self.as_array_ref(); let b = rhs.as_array_ref(); i32x8::new([ i32::from(a[0]) * i32::from(b[0]), i32::from(a[1]) * i32::from(b[1]), i32::from(a[2]) * i32::from(b[2]), i32::from(a[3]) * i32::from(b[3]), i32::from(a[4]) * i32::from(b[4]), i32::from(a[5]) * i32::from(b[5]), i32::from(a[6]) * i32::from(b[6]), i32::from(a[7]) * i32::from(b[7]), ]) } } } /// transpose matrix of 8x8 i16 matrix #[must_use] #[inline] pub fn transpose(data: [i16x8; 8]) -> [i16x8; 8] { pick! { if #[cfg(target_feature="sse2")] { let a1 = unpack_low_i16_m128i(data[0].sse, data[1].sse); let a2 = unpack_high_i16_m128i(data[0].sse, data[1].sse); let a3 = unpack_low_i16_m128i(data[2].sse, data[3].sse); let a4 = unpack_high_i16_m128i(data[2].sse, data[3].sse); let a5 = unpack_low_i16_m128i(data[4].sse, data[5].sse); let a6 = unpack_high_i16_m128i(data[4].sse, data[5].sse); let a7 = unpack_low_i16_m128i(data[6].sse, data[7].sse); let a8 = unpack_high_i16_m128i(data[6].sse, data[7].sse); let b1 = unpack_low_i32_m128i(a1, a3); let b2 = unpack_high_i32_m128i(a1, a3); let b3 = unpack_low_i32_m128i(a2, a4); let b4 = unpack_high_i32_m128i(a2, a4); let b5 = unpack_low_i32_m128i(a5, a7); let b6 = unpack_high_i32_m128i(a5, a7); let b7 = unpack_low_i32_m128i(a6, a8); let b8 = unpack_high_i32_m128i(a6, a8); [ i16x8 { sse: unpack_low_i64_m128i(b1, b5) }, i16x8 { sse: unpack_high_i64_m128i(b1, b5) }, i16x8 { sse: unpack_low_i64_m128i(b2, b6) }, i16x8 { sse: unpack_high_i64_m128i(b2, b6) }, i16x8 { sse: unpack_low_i64_m128i(b3, b7) }, i16x8 { sse: unpack_high_i64_m128i(b3, b7) }, i16x8 { sse: unpack_low_i64_m128i(b4, b8) }, i16x8 { sse: unpack_high_i64_m128i(b4, b8) } , ] } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ #[inline] fn vtrq32(a : int16x8_t, b : int16x8_t) -> (int16x8_t, int16x8_t) { unsafe { let r = vtrnq_s32(vreinterpretq_s32_s16(a),vreinterpretq_s32_s16(b)); (vreinterpretq_s16_s32(r.0), vreinterpretq_s16_s32(r.1)) } } unsafe { let (q0,q2) = vtrq32(data[0].neon, data[2].neon); let (q1,q3) = vtrq32(data[1].neon, data[3].neon); let (q4,q6) = vtrq32(data[4].neon, data[6].neon); let (q5,q7) = vtrq32(data[5].neon, data[7].neon); let b1 = vtrnq_s16(q0, q1); let b2 = vtrnq_s16(q2, q3); let b3 = vtrnq_s16(q4, q5); let b4 = vtrnq_s16(q6, q7); // There is no vtrnq_s64 unfortunately, so there's this mess // which does a somewhat reasonable job, but not as good as the // assembly versions which just swap the 64 bit register aliases. 
[ i16x8 { neon: vcombine_s16(vget_low_s16(b1.0), vget_low_s16(b3.0)) }, i16x8 { neon: vcombine_s16(vget_low_s16(b1.1), vget_low_s16(b3.1)) }, i16x8 { neon: vcombine_s16(vget_low_s16(b2.0), vget_low_s16(b4.0)) }, i16x8 { neon: vcombine_s16(vget_low_s16(b2.1), vget_low_s16(b4.1)) }, i16x8 { neon: vcombine_s16(vget_high_s16(b1.0), vget_high_s16(b3.0)) }, i16x8 { neon: vcombine_s16(vget_high_s16(b1.1), vget_high_s16(b3.1)) }, i16x8 { neon: vcombine_s16(vget_high_s16(b2.0), vget_high_s16(b4.0)) }, i16x8 { neon: vcombine_s16(vget_high_s16(b2.1), vget_high_s16(b4.1)) }, ] } } else if #[cfg(target_feature="simd128")] { #[inline] fn lo_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(a,b) } #[inline] fn hi_i16(a : v128, b : v128) -> v128 { i16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(a,b) } #[inline] fn lo_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<0, 4, 1, 5>(a,b) } #[inline] fn hi_i32(a : v128, b : v128) -> v128 { i32x4_shuffle::<2, 6, 3, 7>(a,b) } #[inline] fn lo_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<0, 2>(a,b) } #[inline] fn hi_i64(a : v128, b : v128) -> v128 { i64x2_shuffle::<1, 3>(a,b) } let a1 = lo_i16(data[0].simd, data[1].simd); let a2 = hi_i16(data[0].simd, data[1].simd); let a3 = lo_i16(data[2].simd, data[3].simd); let a4 = hi_i16(data[2].simd, data[3].simd); let a5 = lo_i16(data[4].simd, data[5].simd); let a6 = hi_i16(data[4].simd, data[5].simd); let a7 = lo_i16(data[6].simd, data[7].simd); let a8 = hi_i16(data[6].simd, data[7].simd); let b1 = lo_i32(a1, a3); let b2 = hi_i32(a1, a3); let b3 = lo_i32(a2, a4); let b4 = hi_i32(a2, a4); let b5 = lo_i32(a5, a7); let b6 = hi_i32(a5, a7); let b7 = lo_i32(a6, a8); let b8 = hi_i32(a6, a8); [ i16x8 { simd: lo_i64(b1, b5) }, i16x8 { simd: hi_i64(b1, b5) }, i16x8 { simd: lo_i64(b2, b6) }, i16x8 { simd: hi_i64(b2, b6) }, i16x8 { simd: lo_i64(b3, b7) }, i16x8 { simd: hi_i64(b3, b7) }, i16x8 { simd: lo_i64(b4, b8) }, i16x8 { simd: hi_i64(b4, b8) } , ] } else { #[inline(always)] fn transpose_column(data: &[i16x8; 8], index: usize) -> i16x8 { i16x8::new([ data[0].as_array_ref()[index], data[1].as_array_ref()[index], data[2].as_array_ref()[index], data[3].as_array_ref()[index], data[4].as_array_ref()[index], data[5].as_array_ref()[index], data[6].as_array_ref()[index], data[7].as_array_ref()[index], ]) } [ transpose_column(&data, 0), transpose_column(&data, 1), transpose_column(&data, 2), transpose_column(&data, 3), transpose_column(&data, 4), transpose_column(&data, 5), transpose_column(&data, 6), transpose_column(&data, 7), ] } } } #[inline] #[must_use] /// Multiply and scale, equivalent to `((self * rhs) + 0x4000) >> 15` on each /// lane, effectively multiplying by a 16 bit fixed point number between `-1` /// and `1`. This corresponds to the following instructions: /// - `vqrdmulhq_n_s16` instruction on neon /// - `i16x8_q15mulr_sat` on simd128 /// - `_mm_mulhrs_epi16` on ssse3 /// - emulated via `mul_i16_*` on sse2 pub fn mul_scale_round_n(self, rhs: i16) -> Self { pick! 
{ if #[cfg(target_feature="ssse3")] { Self { sse: mul_i16_scale_round_m128i(self.sse, set_splat_i16_m128i(rhs)) } } else if #[cfg(target_feature="sse2")] { // unfortunately mul_i16_scale_round_m128i only got added in sse3 let r = set_splat_i16_m128i(rhs); let hi = mul_i16_keep_high_m128i(self.sse, r); let lo = mul_i16_keep_low_m128i(self.sse, r); let mut v1 = unpack_low_i16_m128i(lo, hi); let mut v2 = unpack_high_i16_m128i(lo, hi); let a = set_splat_i32_m128i(0x4000); v1 = shr_imm_i32_m128i::<15>(add_i32_m128i(v1, a)); v2 = shr_imm_i32_m128i::<15>(add_i32_m128i(v2, a)); let s = pack_i32_to_i16_m128i(v1, v2); Self { sse: s } } else if #[cfg(target_feature="simd128")] { Self { simd: i16x8_q15mulr_sat(self.simd, i16x8_splat(rhs)) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vqrdmulhq_n_s16(self.neon, rhs) } } } else { // compiler does a surprisingly good job of vectorizing this Self { arr: [ ((i32::from(self.arr[0]) * i32::from(rhs) + 0x4000) >> 15) as i16, ((i32::from(self.arr[1]) * i32::from(rhs) + 0x4000) >> 15) as i16, ((i32::from(self.arr[2]) * i32::from(rhs) + 0x4000) >> 15) as i16, ((i32::from(self.arr[3]) * i32::from(rhs) + 0x4000) >> 15) as i16, ((i32::from(self.arr[4]) * i32::from(rhs) + 0x4000) >> 15) as i16, ((i32::from(self.arr[5]) * i32::from(rhs) + 0x4000) >> 15) as i16, ((i32::from(self.arr[6]) * i32::from(rhs) + 0x4000) >> 15) as i16, ((i32::from(self.arr[7]) * i32::from(rhs) + 0x4000) >> 15) as i16, ]} } } } #[inline] pub fn to_array(self) -> [i16; 8] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[i16; 8] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [i16; 8] { cast_mut(self) } } wide-0.7.32/src/i32x4_.rs000066400000000000000000000541211473735473700147270ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct i32x4 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] pub struct i32x4 { pub(crate) simd: v128 } impl Default for i32x4 { fn default() -> Self { Self::splat(0) } } impl PartialEq for i32x4 { fn eq(&self, other: &Self) -> bool { u32x4_all_true(i32x4_eq(self.simd, other.simd)) } } impl Eq for i32x4 { } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ use core::arch::aarch64::*; #[repr(C)] #[derive(Copy, Clone)] pub struct i32x4 { pub(crate) neon : int32x4_t } impl Default for i32x4 { #[inline] #[must_use] fn default() -> Self { Self::splat(0) } } impl PartialEq for i32x4 { #[inline] #[must_use] fn eq(&self, other: &Self) -> bool { unsafe { vminvq_u32(vceqq_s32(self.neon, other.neon))==u32::MAX } } } impl Eq for i32x4 { } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct i32x4 { pub(crate) arr: [i32;4] } } } int_uint_consts!(i32, 4, i32x4, 128); unsafe impl Zeroable for i32x4 {} unsafe impl Pod for i32x4 {} impl Add for i32x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: add_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_s32(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_add(rhs.arr[0]), self.arr[1].wrapping_add(rhs.arr[1]), self.arr[2].wrapping_add(rhs.arr[2]), self.arr[3].wrapping_add(rhs.arr[3]), ]} } } } } impl Sub for i32x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vsubq_s32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].wrapping_sub(rhs.arr[0]), self.arr[1].wrapping_sub(rhs.arr[1]), self.arr[2].wrapping_sub(rhs.arr[2]), self.arr[3].wrapping_sub(rhs.arr[3]), ]} } } } } impl Mul for i32x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: mul_32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_mul(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmulq_s32(self.neon, rhs.neon) }} } else { let arr1: [i32; 4] = cast(self); let arr2: [i32; 4] = cast(rhs); cast([ arr1[0].wrapping_mul(arr2[0]), arr1[1].wrapping_mul(arr2[1]), arr1[2].wrapping_mul(arr2[2]), arr1[3].wrapping_mul(arr2[3]), ]) } } } } impl Add for i32x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: i32) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for i32x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: i32) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for i32x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: i32) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for i32 { type Output = i32x4; #[inline] #[must_use] fn add(self, rhs: i32x4) -> Self::Output { i32x4::splat(self).add(rhs) } } impl Sub for i32 { type Output = i32x4; #[inline] #[must_use] fn sub(self, rhs: i32x4) -> Self::Output { i32x4::splat(self).sub(rhs) } } impl Mul for i32 { type Output = i32x4; #[inline] #[must_use] fn mul(self, rhs: i32x4) -> Self::Output { i32x4::splat(self).mul(rhs) } } impl BitAnd for i32x4 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitand_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vandq_s32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitand(rhs.arr[0]), self.arr[1].bitand(rhs.arr[1]), self.arr[2].bitand(rhs.arr[2]), self.arr[3].bitand(rhs.arr[3]), ]} } } } } impl BitOr for i32x4 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: bitor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vorrq_s32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitor(rhs.arr[0]), self.arr[1].bitor(rhs.arr[1]), self.arr[2].bitor(rhs.arr[2]), self.arr[3].bitor(rhs.arr[3]), ]} } } } } impl BitXor for i32x4 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitxor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: veorq_s32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitxor(rhs.arr[0]), self.arr[1].bitxor(rhs.arr[1]), self.arr[2].bitxor(rhs.arr[2]), self.arr[3].bitxor(rhs.arr[3]), ]} } } } } macro_rules! impl_shl_t_for_i32x4 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for i32x4 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shl_all_u32_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_shl(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_s32(self.neon, vmovq_n_s32(rhs as i32)) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] << u, self.arr[1] << u, self.arr[2] << u, self.arr[3] << u, ]} } } } })+ }; } impl_shl_t_for_i32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_i32x4 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for i32x4 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shr_all_i32_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_shr(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_s32(self.neon, vmovq_n_s32( -(rhs as i32))) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] >> u, self.arr[1] >> u, self.arr[2] >> u, self.arr[3] >> u, ]} } } } })+ }; } impl_shr_t_for_i32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); /// Shifts lanes by the corresponding lane. /// /// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth /// of the type. (same as `wrapping_shr`) impl Shr for i32x4 { type Output = Self; #[inline] #[must_use] fn shr(self, rhs: i32x4) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { // mask the shift count to 31 to have same behavior on all platforms let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31)); Self { sse: shr_each_i32_m128i(self.sse, shift_by) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { // mask the shift count to 31 to have same behavior on all platforms // no right shift, have to pass negative value to left shift on neon let shift_by = vnegq_s32(vandq_s32(rhs.neon, vmovq_n_s32(31))); Self { neon: vshlq_s32(self.neon, shift_by) } } } else { let arr: [i32; 4] = cast(self); let rhs: [i32; 4] = cast(rhs); cast([ arr[0].wrapping_shr(rhs[0] as u32), arr[1].wrapping_shr(rhs[1] as u32), arr[2].wrapping_shr(rhs[2] as u32), arr[3].wrapping_shr(rhs[3] as u32), ]) } } } } /// Shifts lanes by the corresponding lane. /// /// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth /// of the type. (same as `wrapping_shl`) impl Shl for i32x4 { type Output = Self; #[inline] #[must_use] fn shl(self, rhs: i32x4) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { // mask the shift count to 31 to have same behavior on all platforms let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31)); Self { sse: shl_each_u32_m128i(self.sse, shift_by) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { // mask the shift count to 31 to have same behavior on all platforms let shift_by = vandq_s32(rhs.neon, vmovq_n_s32(31)); Self { neon: vshlq_s32(self.neon, shift_by) } } } else { let arr: [i32; 4] = cast(self); let rhs: [i32; 4] = cast(rhs); cast([ arr[0].wrapping_shl(rhs[0] as u32), arr[1].wrapping_shl(rhs[1] as u32), arr[2].wrapping_shl(rhs[2] as u32), arr[3].wrapping_shl(rhs[3] as u32), ]) } } } } impl CmpEq for i32x4 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_eq_mask_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s32_u32(vceqq_s32(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] == rhs.arr[0] { -1 } else { 0 }, if self.arr[1] == rhs.arr[1] { -1 } else { 0 }, if self.arr[2] == rhs.arr[2] { -1 } else { 0 }, if self.arr[3] == rhs.arr[3] { -1 } else { 0 }, ]} } } } } impl CmpGt for i32x4 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_gt_mask_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_gt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s32_u32(vcgtq_s32(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] > rhs.arr[0] { -1 } else { 0 }, if self.arr[1] > rhs.arr[1] { -1 } else { 0 }, if self.arr[2] > rhs.arr[2] { -1 } else { 0 }, if self.arr[3] > rhs.arr[3] { -1 } else { 0 }, ]} } } } } impl CmpLt for i32x4 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: cmp_lt_mask_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_lt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s32_u32(vcltq_s32(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] < rhs.arr[0] { -1 } else { 0 }, if self.arr[1] < rhs.arr[1] { -1 } else { 0 }, if self.arr[2] < rhs.arr[2] { -1 } else { 0 }, if self.arr[3] < rhs.arr[3] { -1 } else { 0 }, ]} } } } } impl i32x4 { #[inline] #[must_use] pub const fn new(array: [i32; 4]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vbslq_s32(vreinterpretq_u32_s32(self.neon), t.neon, f.neon) }} } else { generic_bit_blend(self, t, f) } } } /// Multiplies corresponding 32 bit lanes and returns the 64 bit result /// on the corresponding lanes. /// /// Effectively does two multiplies on 128 bit platforms, but is easier /// to use than wrapping mul_widen_i32_odd_m128i individually. #[inline] #[must_use] pub fn mul_widen(self, rhs: Self) -> i64x4 { pick! { if #[cfg(target_feature="avx2")] { let a = convert_to_i64_m256i_from_i32_m128i(self.sse); let b = convert_to_i64_m256i_from_i32_m128i(rhs.sse); cast(mul_i64_low_bits_m256i(a, b)) } else if #[cfg(target_feature="sse4.1")] { let evenp = mul_widen_i32_odd_m128i(self.sse, rhs.sse); let oddp = mul_widen_i32_odd_m128i( shr_imm_u64_m128i::<32>(self.sse), shr_imm_u64_m128i::<32>(rhs.sse)); i64x4 { a: i64x2 { sse: unpack_low_i64_m128i(evenp, oddp)}, b: i64x2 { sse: unpack_high_i64_m128i(evenp, oddp)} } } else if #[cfg(target_feature="simd128")] { i64x4 { a: i64x2 { simd: i64x2_extmul_low_i32x4(self.simd, rhs.simd) }, b: i64x2 { simd: i64x2_extmul_high_i32x4(self.simd, rhs.simd) }, } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe { i64x4 { a: i64x2 { neon: vmull_s32(vget_low_s32(self.neon), vget_low_s32(rhs.neon)) }, b: i64x2 { neon: vmull_s32(vget_high_s32(self.neon), vget_high_s32(rhs.neon)) } } } } else { let a: [i32; 4] = cast(self); let b: [i32; 4] = cast(rhs); cast([ i64::from(a[0]) * i64::from(b[0]), i64::from(a[1]) * i64::from(b[1]), i64::from(a[2]) * i64::from(b[2]), i64::from(a[3]) * i64::from(b[3]), ]) } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="ssse3")] { Self { sse: abs_i32_m128i(self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vabsq_s32(self.neon) }} } else { let arr: [i32; 4] = cast(self); cast([ arr[0].wrapping_abs(), arr[1].wrapping_abs(), arr[2].wrapping_abs(), arr[3].wrapping_abs(), ]) } } } #[inline] #[must_use] pub fn unsigned_abs(self) -> u32x4 { pick! 
{ if #[cfg(target_feature="ssse3")] { u32x4 { sse: abs_i32_m128i(self.sse) } } else if #[cfg(target_feature="simd128")] { u32x4 { simd: i32x4_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {u32x4 { neon: vreinterpretq_u32_s32(vabsq_s32(self.neon)) }} } else { let arr: [i32; 4] = cast(self); cast([ arr[0].unsigned_abs(), arr[1].unsigned_abs(), arr[2].unsigned_abs(), arr[3].unsigned_abs(), ]) } } } /// horizontal add of all the elements of the vector #[inline] #[must_use] pub fn reduce_add(self) -> i32 { pick! { if #[cfg(target_feature="sse2")] { let hi64 = unpack_high_i64_m128i(self.sse, self.sse); let sum64 = add_i32_m128i(hi64, self.sse); let hi32 = shuffle_ai_f32_all_m128i::<0b10_11_00_01>(sum64); // Swap the low two elements let sum32 = add_i32_m128i(sum64, hi32); get_i32_from_m128i_s(sum32) } else { let arr: [i32; 4] = cast(self); arr[0].wrapping_add(arr[1]).wrapping_add( arr[2].wrapping_add(arr[3])) } } } /// horizontal max of all the elements of the vector #[inline] #[must_use] pub fn reduce_max(self) -> i32 { let arr: [i32; 4] = cast(self); arr[0].max(arr[1]).max(arr[2].max(arr[3])) } /// horizontal min of all the elements of the vector #[inline] #[must_use] pub fn reduce_min(self) -> i32 { let arr: [i32; 4] = cast(self); arr[0].min(arr[1]).min(arr[2].min(arr[3])) } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: max_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_max(self.simd, rhs.simd) } } else { self.cmp_lt(rhs).blend(rhs, self) } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: min_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i32x4_min(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminq_s32(self.neon, rhs.neon) }} } else { self.cmp_lt(rhs).blend(self, rhs) } } } #[inline] #[must_use] pub fn round_float(self) -> f32x4 { pick! { if #[cfg(target_feature="sse2")] { cast(convert_to_m128_from_i32_m128i(self.sse)) } else if #[cfg(target_feature="simd128")] { cast(Self { simd: f32x4_convert_i32x4(self.simd) }) } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ cast(unsafe {Self { neon: vreinterpretq_s32_f32(vcvtq_f32_s32(self.neon)) }}) } else { let arr: [i32; 4] = cast(self); cast([ arr[0] as f32, arr[1] as f32, arr[2] as f32, arr[3] as f32, ]) } } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="sse2")] { // use f32 move_mask since it is the same size as i32 move_mask_m128(cast(self.sse)) } else if #[cfg(target_feature="simd128")] { u32x4_bitmask(self.simd) as i32 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { // set all to 1 if top bit is set, else 0 let masked = vcltq_s32(self.neon, vdupq_n_s32(0)); // select the right bit out of each lane let selectbit : uint32x4_t = core::intrinsics::transmute([1u32, 2, 4, 8]); let r = vandq_u32(masked, selectbit); // horizontally add the 32-bit lanes vaddvq_u32(r) as i32 } } else { ((self.arr[0] < 0) as i32) << 0 | ((self.arr[1] < 0) as i32) << 1 | ((self.arr[2] < 0) as i32) << 2 | ((self.arr[3] < 0) as i32) << 3 } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! 
{ if #[cfg(target_feature="sse2")] { // use f32 move_mask since it is the same size as i32 move_mask_m128(cast(self.sse)) != 0 } else if #[cfg(target_feature="simd128")] { u32x4_bitmask(self.simd) != 0 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { // some lanes are negative unsafe { vminvq_s32(self.neon) < 0 } } else { let v : [u64;2] = cast(self); ((v[0] | v[1]) & 0x8000000080000000) != 0 } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="sse2")] { // use f32 move_mask since it is the same size as i32 move_mask_m128(cast(self.sse)) == 0b1111 } else if #[cfg(target_feature="simd128")] { u32x4_bitmask(self.simd) == 0b1111 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ // all lanes are negative unsafe { vmaxvq_s32(self.neon) < 0 } } else { let v : [u64;2] = cast(self); (v[0] & v[1] & 0x8000000080000000) == 0x8000000080000000 } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] pub fn to_array(self) -> [i32; 4] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[i32; 4] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [i32; 4] { cast_mut(self) } } wide-0.7.32/src/i32x8_.rs000066400000000000000000000403161473735473700147340ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct i32x8 { pub(crate) avx2: m256i } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct i32x8 { pub(crate) a : i32x4, pub(crate) b : i32x4} } } int_uint_consts!(i32, 8, i32x8, 256); unsafe impl Zeroable for i32x8 {} unsafe impl Pod for i32x8 {} impl Add for i32x8 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: add_i32_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.add(rhs.a), b : self.b.add(rhs.b), } } } } } impl Sub for i32x8 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: sub_i32_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.sub(rhs.a), b : self.b.sub(rhs.b), } } } } } impl Mul for i32x8 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: mul_i32_keep_low_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.mul(rhs.a), b : self.b.mul(rhs.b), } } } } } impl Add for i32x8 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: i32) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for i32x8 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: i32) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for i32x8 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: i32) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for i32 { type Output = i32x8; #[inline] #[must_use] fn add(self, rhs: i32x8) -> Self::Output { i32x8::splat(self) + rhs } } impl Sub for i32 { type Output = i32x8; #[inline] #[must_use] fn sub(self, rhs: i32x8) -> Self::Output { i32x8::splat(self) - rhs } } impl Mul for i32 { type Output = i32x8; #[inline] #[must_use] fn mul(self, rhs: i32x8) -> Self::Output { i32x8::splat(self) * rhs } } impl BitAnd for i32x8 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! 
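// Like the arithmetic impls above, the bitwise ops below simply pick between
// the single 256-bit AVX2 register and the two-`i32x4`-halves fallback laid out
// at the top of this file; either representation yields the same lanes.
// Small sketch:
//
//   let x = i32x8::new([1, 2, 3, 4, 5, 6, 7, 8]);
//   assert_eq!((x + x).to_array(), [2, 4, 6, 8, 10, 12, 14, 16]);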
{ if #[cfg(target_feature="avx2")] { Self { avx2: bitand_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitand(rhs.a), b : self.b.bitand(rhs.b), } } } } } impl BitOr for i32x8 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitor(rhs.a), b : self.b.bitor(rhs.b), } } } } } impl BitXor for i32x8 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitxor(rhs.a), b : self.b.bitxor(rhs.b), } } } } } macro_rules! impl_shl_t_for_i32x8 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for i32x8 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shl_all_u32_m256i(self.avx2, shift) } } else { Self { a : self.a.shl(rhs), b : self.b.shl(rhs), } } } } })+ }; } impl_shl_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_i32x8 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for i32x8 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shr_all_i32_m256i(self.avx2, shift) } } else { Self { a : self.a.shr(rhs), b : self.b.shr(rhs), } } } } })+ }; } impl_shr_t_for_i32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); /// Shifts lanes by the corresponding lane. /// /// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth /// of the type. (same as `wrapping_shr`) impl Shr for i32x8 { type Output = Self; #[inline] #[must_use] fn shr(self, rhs: i32x8) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { // ensure same behavior as scalar let shift_by = bitand_m256i(rhs.avx2, set_splat_i32_m256i(31)); Self { avx2: shr_each_i32_m256i(self.avx2, shift_by ) } } else { Self { a : self.a.shr(rhs.a), b : self.b.shr(rhs.b), } } } } } /// Shifts lanes by the corresponding lane. /// /// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth /// of the type. (same as `wrapping_shl`) impl Shl for i32x8 { type Output = Self; #[inline] #[must_use] fn shl(self, rhs: i32x8) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { // ensure same behavior as scalar wrapping_shl by masking the shift count let shift_by = bitand_m256i(rhs.avx2, set_splat_i32_m256i(31)); // shl is the same for unsigned and signed Self { avx2: shl_each_u32_m256i(self.avx2, shift_by) } } else { Self { a : self.a.shl(rhs.a), b : self.b.shl(rhs.b), } } } } } impl CmpEq for i32x8 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: cmp_eq_mask_i32_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.cmp_eq(rhs.a), b : self.b.cmp_eq(rhs.b), } } } } } impl CmpGt for i32x8 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! 
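// Note on the by-lane `Shr`/`Shl` impls above: the shift count in each lane is
// masked to the lane width first, matching scalar `wrapping_shr`/`wrapping_shl`.
// Sketch:
//
//   let v = i32x8::splat(-16);
//   let by = i32x8::new([0, 1, 2, 3, 4, 31, 32, 33]); // 32 and 33 mask to 0 and 1
//   assert_eq!((v >> by).to_array(), [-16, -8, -4, -2, -1, -1, -16, -8]);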
{ if #[cfg(target_feature="avx2")] { Self { avx2: cmp_gt_mask_i32_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.cmp_gt(rhs.a), b : self.b.cmp_gt(rhs.b), } } } } } impl CmpLt for i32x8 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: !cmp_gt_mask_i32_m256i(self.avx2, rhs.avx2) ^ cmp_eq_mask_i32_m256i(self.avx2,rhs.avx2) } } else { Self { a : self.a.cmp_lt(rhs.a), b : self.b.cmp_lt(rhs.b), } } } } } impl From for i32x8 { #[inline] #[must_use] fn from(value: i16x8) -> Self { i32x8::from_i16x8(value) } } impl i32x8 { #[inline] #[must_use] pub const fn new(array: [i32; 8]) -> Self { unsafe { core::intrinsics::transmute(array) } } /// widens and sign extends to `i32x8` #[inline] #[must_use] pub fn from_i16x8(v: i16x8) -> Self { pick! { if #[cfg(target_feature="avx2")] { i32x8 { avx2:convert_to_i32_m256i_from_i16_m128i(v.sse) } } else if #[cfg(target_feature="sse2")] { i32x8 { a: i32x4 { sse: shr_imm_i32_m128i::<16>( unpack_low_i16_m128i(v.sse, v.sse)) }, b: i32x4 { sse: shr_imm_i32_m128i::<16>( unpack_high_i16_m128i(v.sse, v.sse)) }, } } else { i32x8::new([ i32::from(v.as_array_ref()[0]), i32::from(v.as_array_ref()[1]), i32::from(v.as_array_ref()[2]), i32::from(v.as_array_ref()[3]), i32::from(v.as_array_ref()[4]), i32::from(v.as_array_ref()[5]), i32::from(v.as_array_ref()[6]), i32::from(v.as_array_ref()[7]), ]) } } } /// widens and zero extends to `i32x8` #[inline] #[must_use] pub fn from_u16x8(v: u16x8) -> Self { pick! { if #[cfg(target_feature="avx2")] { i32x8 { avx2:convert_to_i32_m256i_from_u16_m128i(v.sse) } } else if #[cfg(target_feature="sse2")] { i32x8 { a: i32x4 { sse: shr_imm_u32_m128i::<16>( unpack_low_i16_m128i(v.sse, v.sse)) }, b: i32x4 { sse: shr_imm_u32_m128i::<16>( unpack_high_i16_m128i(v.sse, v.sse)) }, } } else { i32x8::new([ i32::from(v.as_array_ref()[0]), i32::from(v.as_array_ref()[1]), i32::from(v.as_array_ref()[2]), i32::from(v.as_array_ref()[3]), i32::from(v.as_array_ref()[4]), i32::from(v.as_array_ref()[5]), i32::from(v.as_array_ref()[6]), i32::from(v.as_array_ref()[7]), ]) } } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: blend_varying_i8_m256i(f.avx2, t.avx2, self.avx2) } } else { Self { a : self.a.blend(t.a, f.a), b : self.b.blend(t.b, f.b) } } } } /// horizontal add of all the elements of the vector #[inline] #[must_use] pub fn reduce_add(self) -> i32 { let arr: [i32x4; 2] = cast(self); (arr[0] + arr[1]).reduce_add() } /// horizontal max of all the elements of the vector #[inline] #[must_use] pub fn reduce_max(self) -> i32 { let arr: [i32x4; 2] = cast(self); arr[0].max(arr[1]).reduce_max() } /// horizontal min of all the elements of the vector #[inline] #[must_use] pub fn reduce_min(self) -> i32 { let arr: [i32x4; 2] = cast(self); arr[0].min(arr[1]).reduce_min() } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: abs_i32_m256i(self.avx2) } } else { Self { a : self.a.abs(), b : self.b.abs(), } } } } #[inline] #[must_use] pub fn unsigned_abs(self) -> u32x8 { pick! { if #[cfg(target_feature="avx2")] { u32x8 { avx2: abs_i32_m256i(self.avx2) } } else { u32x8 { a : self.a.unsigned_abs(), b : self.b.unsigned_abs(), } } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! 
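// The widening conversions above differ only in how the upper 16 bits are
// filled: `from_i16x8` sign-extends while `from_u16x8` zero-extends. Sketch
// (the `i16x8`/`u16x8` constructors live in their own modules):
//
//   let s = i16x8::new([-1, -2, 3, 4, 5, 6, 7, 8]);
//   assert_eq!(i32x8::from_i16x8(s).to_array(), [-1, -2, 3, 4, 5, 6, 7, 8]);
//   let u = u16x8::new([u16::MAX, 1, 2, 3, 4, 5, 6, 7]);
//   assert_eq!(i32x8::from_u16x8(u).to_array(), [65535, 1, 2, 3, 4, 5, 6, 7]);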
{ if #[cfg(target_feature="avx2")] { Self { avx2: max_i32_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.max(rhs.a), b : self.b.max(rhs.b), } } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: min_i32_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.min(rhs.a), b : self.b.min(rhs.b), } } } } #[inline] #[must_use] pub fn round_float(self) -> f32x8 { pick! { if #[cfg(target_feature="avx2")] { cast(convert_to_m256_from_i32_m256i(self.avx2)) } else { cast([ self.a.round_float(), self.b.round_float(), ]) } } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="avx2")] { // use f32 move_mask since it is the same size as i32 move_mask_m256(cast(self.avx2)) } else { self.a.move_mask() | (self.b.move_mask() << 4) } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { move_mask_m256(cast(self.avx2)) != 0 } else { (self.a | self.b).any() } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { move_mask_m256(cast(self.avx2)) == 0b11111111 } else { (self.a & self.b).all() } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } /// Transpose matrix of 8x8 `i32` matrix. Currently only accelerated on AVX2. #[must_use] #[inline] pub fn transpose(data: [i32x8; 8]) -> [i32x8; 8] { pick! { if #[cfg(target_feature="avx2")] { let a0 = unpack_low_i32_m256i(data[0].avx2, data[1].avx2); let a1 = unpack_high_i32_m256i(data[0].avx2, data[1].avx2); let a2 = unpack_low_i32_m256i(data[2].avx2, data[3].avx2); let a3 = unpack_high_i32_m256i(data[2].avx2, data[3].avx2); let a4 = unpack_low_i32_m256i(data[4].avx2, data[5].avx2); let a5 = unpack_high_i32_m256i(data[4].avx2, data[5].avx2); let a6 = unpack_low_i32_m256i(data[6].avx2, data[7].avx2); let a7 = unpack_high_i32_m256i(data[6].avx2, data[7].avx2); pub const fn mm_shuffle(z: i32, y: i32, x: i32, w: i32) -> i32 { (z << 6) | (y << 4) | (x << 2) | w } const SHUFF_LO : i32 = mm_shuffle(1,0,1,0); const SHUFF_HI : i32 = mm_shuffle(3,2,3,2); // possible todo: intel performance manual suggests alternative with blend to avoid port 5 pressure // (since blend runs on a different port than shuffle) let b0 = cast::(shuffle_m256::(cast(a0),cast(a2))); let b1 = cast::(shuffle_m256::(cast(a0),cast(a2))); let b2 = cast::(shuffle_m256::(cast(a1),cast(a3))); let b3 = cast::(shuffle_m256::(cast(a1),cast(a3))); let b4 = cast::(shuffle_m256::(cast(a4),cast(a6))); let b5 = cast::(shuffle_m256::(cast(a4),cast(a6))); let b6 = cast::(shuffle_m256::(cast(a5),cast(a7))); let b7 = cast::(shuffle_m256::(cast(a5),cast(a7))); [ i32x8 { avx2: permute2z_m256i::<0x20>(b0, b4) }, i32x8 { avx2: permute2z_m256i::<0x20>(b1, b5) }, i32x8 { avx2: permute2z_m256i::<0x20>(b2, b6) }, i32x8 { avx2: permute2z_m256i::<0x20>(b3, b7) }, i32x8 { avx2: permute2z_m256i::<0x31>(b0, b4) }, i32x8 { avx2: permute2z_m256i::<0x31>(b1, b5) }, i32x8 { avx2: permute2z_m256i::<0x31>(b2, b6) }, i32x8 { avx2: permute2z_m256i::<0x31>(b3, b7) } ] } else { // possible todo: not sure that 128bit SIMD gives us a a lot of speedup here #[inline(always)] fn transpose_column(data: &[i32x8; 8], index: usize) -> i32x8 { i32x8::new([ data[0].as_array_ref()[index], data[1].as_array_ref()[index], data[2].as_array_ref()[index], data[3].as_array_ref()[index], data[4].as_array_ref()[index], data[5].as_array_ref()[index], data[6].as_array_ref()[index], data[7].as_array_ref()[index], ]) } [ transpose_column(&data, 0), 
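// `transpose_column` gathers lane `index` from every input row, so output row
// i of the transpose holds input column i. Sketch (illustrative values):
//
//   let rows = [i32x8::new([0, 1, 2, 3, 4, 5, 6, 7]); 8];
//   let cols = i32x8::transpose(rows);
//   assert_eq!(cols[3].to_array(), [3; 8]); // column 3 of every identical row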
transpose_column(&data, 1), transpose_column(&data, 2), transpose_column(&data, 3), transpose_column(&data, 4), transpose_column(&data, 5), transpose_column(&data, 6), transpose_column(&data, 7), ] } } } #[inline] pub fn to_array(self) -> [i32; 8] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[i32; 8] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [i32; 8] { cast_mut(self) } } impl Not for i32x8 { type Output = Self; #[inline] fn not(self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: self.avx2.not() } } else { Self { a : self.a.not(), b : self.b.not(), } } } } } wide-0.7.32/src/i64x2_.rs000066400000000000000000000327731473735473700147430ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct i64x2 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] pub struct i64x2 { pub(crate) simd: v128 } impl Default for i64x2 { fn default() -> Self { Self::splat(0) } } impl PartialEq for i64x2 { fn eq(&self, other: &Self) -> bool { u64x2_all_true(i64x2_eq(self.simd, other.simd)) } } impl Eq for i64x2 { } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ use core::arch::aarch64::*; #[repr(C)] #[derive(Copy, Clone)] pub struct i64x2 { pub(crate) neon : int64x2_t } impl Default for i64x2 { #[inline] #[must_use] fn default() -> Self { unsafe { Self { neon: vdupq_n_s64(0)} } } } impl PartialEq for i64x2 { #[inline] #[must_use] fn eq(&self, other: &Self) -> bool { unsafe { vgetq_lane_s64(self.neon,0) == vgetq_lane_s64(other.neon,0) && vgetq_lane_s64(self.neon,1) == vgetq_lane_s64(other.neon,1) } } } impl Eq for i64x2 { } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct i64x2 { arr: [i64;2] } } } int_uint_consts!(i64, 2, i64x2, 128); unsafe impl Zeroable for i64x2 {} unsafe impl Pod for i64x2 {} impl Add for i64x2 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: add_i64_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i64x2_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_s64(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_add(rhs.arr[0]), self.arr[1].wrapping_add(rhs.arr[1]), ]} } } } } impl Sub for i64x2 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_i64_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i64x2_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vsubq_s64(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_sub(rhs.arr[0]), self.arr[1].wrapping_sub(rhs.arr[1]), ]} } } } } //we should try to implement this on sse2 impl Mul for i64x2 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! 
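// As the note above says, there is no packed 64-bit multiply at the SSE2
// feature level, so outside of `simd128` this lowers to per-lane
// `wrapping_mul`. Sketch:
//
//   let a = i64x2::new([3, -4]);
//   let b = i64x2::new([5, 7]);
//   assert_eq!((a * b).to_array(), [15, -28]); // each lane wraps on overflow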
{ if #[cfg(target_feature="simd128")] { Self { simd: i64x2_mul(self.simd, rhs.simd) } } else { let arr1: [i64; 2] = cast(self); let arr2: [i64; 2] = cast(rhs); cast([ arr1[0].wrapping_mul(arr2[0]), arr1[1].wrapping_mul(arr2[1]), ]) } } } } impl Add for i64x2 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: i64) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for i64x2 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: i64) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for i64x2 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: i64) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for i64 { type Output = i64x2; #[inline] #[must_use] fn add(self, rhs: i64x2) -> Self::Output { i64x2::splat(self).add(rhs) } } impl Sub for i64 { type Output = i64x2; #[inline] #[must_use] fn sub(self, rhs: i64x2) -> Self::Output { i64x2::splat(self).sub(rhs) } } impl Mul for i64 { type Output = i64x2; #[inline] #[must_use] fn mul(self, rhs: i64x2) -> Self::Output { i64x2::splat(self).mul(rhs) } } impl BitAnd for i64x2 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitand_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vandq_s64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitand(rhs.arr[0]), self.arr[1].bitand(rhs.arr[1]), ]} } } } } impl BitOr for i64x2 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vorrq_s64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitor(rhs.arr[0]), self.arr[1].bitor(rhs.arr[1]), ]} } } } } impl BitXor for i64x2 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitxor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: veorq_s64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitxor(rhs.arr[0]), self.arr[1].bitxor(rhs.arr[1]), ]} } } } } macro_rules! impl_shl_t_for_i64x2 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for i64x2 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shl_all_u64_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: i64x2_shl(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_s64(self.neon, vmovq_n_s64(rhs as i64)) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] << u, self.arr[1] << u, ]} } } } })+ }; } impl_shl_t_for_i64x2!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_i64x2 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for i64x2 { type Output = Self; /// Shifts all lanes by the value given. 
#[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="simd128")] { Self { simd: i64x2_shr(self.simd, rhs as u32) } } else { let u = rhs as u64; let arr: [i64; 2] = cast(self); cast([ arr[0] >> u, arr[1] >> u, ]) } } } })+ }; } impl_shr_t_for_i64x2!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); impl CmpEq for i64x2 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: cmp_eq_mask_i64_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i64x2_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s64_u64(vceqq_s64(self.neon, rhs.neon)) }} } else { let s: [i64;2] = cast(self); let r: [i64;2] = cast(rhs); cast([ if s[0] == r[0] { -1_i64 } else { 0 }, if s[1] == r[1] { -1_i64 } else { 0 }, ]) } } } } impl CmpGt for i64x2 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse4.2")] { Self { sse: cmp_gt_mask_i64_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i64x2_gt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s64_u64(vcgtq_s64(self.neon, rhs.neon)) }} } else { let s: [i64;2] = cast(self); let r: [i64;2] = cast(rhs); cast([ if s[0] > r[0] { -1_i64 } else { 0 }, if s[1] > r[1] { -1_i64 } else { 0 }, ]) } } } } impl CmpLt for i64x2 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse4.2")] { Self { sse: !cmp_gt_mask_i64_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i64x2_lt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s64_u64(vcltq_s64(self.neon, rhs.neon)) }} } else { let s: [i64;2] = cast(self); let r: [i64;2] = cast(rhs); cast([ if s[0] < r[0] { -1_i64 } else { 0 }, if s[1] < r[1] { -1_i64 } else { 0 }, ]) } } } } impl i64x2 { #[inline] #[must_use] pub const fn new(array: [i64; 2]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vbslq_s64(vreinterpretq_u64_s64(self.neon), t.neon, f.neon) }} } else { generic_bit_blend(self, t, f) } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { // x86 doesn't have this builtin if #[cfg(target_feature="simd128")] { Self { simd: i64x2_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vabsq_s64(self.neon) }} } else { let arr: [i64; 2] = cast(self); cast( [ arr[0].wrapping_abs(), arr[1].wrapping_abs(), ]) } } } #[inline] #[must_use] pub fn unsigned_abs(self) -> u64x2 { pick! 
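// `blend` above treats `self` as a lane mask (normally the all-ones/all-zeros
// output of a `cmp_*` call): lanes whose mask is set take `t`, the rest take
// `f`. Sketch:
//
//   let mask = i64x2::new([5, 5]).cmp_gt(i64x2::new([3, 9])); // [-1, 0]
//   let picked = mask.blend(i64x2::splat(111), i64x2::splat(222));
//   assert_eq!(picked.to_array(), [111, 222]);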
{ // x86 doesn't have this builtin if #[cfg(target_feature="simd128")] { u64x2 { simd: i64x2_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {u64x2 { neon: vreinterpretq_u64_s64(vabsq_s64(self.neon)) }} } else { let arr: [i64; 2] = cast(self); cast( [ arr[0].unsigned_abs(), arr[1].unsigned_abs(), ]) } } } #[inline] #[must_use] pub fn round_float(self) -> f64x2 { let arr: [i64; 2] = cast(self); cast([arr[0] as f64, arr[1] as f64]) } /// returns the bit mask for each high bit set in the vector with the lowest /// lane being the lowest bit #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="sse")] { // use f64 move_mask since it is the same size as i64 move_mask_m128d(cast(self.sse)) } else if #[cfg(target_feature="simd128")] { i64x2_bitmask(self.simd) as i32 } else { // nothing amazingly efficient for neon let arr: [u64; 2] = cast(self); (arr[0] >> 63 | ((arr[1] >> 62) & 2)) as i32 } } } /// true if any high bits are set for any value in the vector #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="sse")] { // use f64 move_mask since it is the same size as i64 move_mask_m128d(cast(self.sse)) != 0 } else if #[cfg(target_feature="simd128")] { i64x2_bitmask(self.simd) != 0 } else { let v : [u64;2] = cast(self); ((v[0] | v[1]) & 0x8000000000000000) != 0 } } } /// true if all high bits are set for every value in the vector #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { // use f64 move_mask since it is the same size as i64 move_mask_m128d(cast(self.sse)) == 0b11 } else if #[cfg(target_feature="simd128")] { i64x2_bitmask(self.simd) == 0b11 } else { let v : [u64;2] = cast(self); ((v[0] & v[1]) & 0x8000000000000000) == 0x8000000000000000 } } } /// true if no high bits are set for any values of the vector #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] pub fn to_array(self) -> [i64; 2] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[i64; 2] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [i64; 2] { cast_mut(self) } } wide-0.7.32/src/i64x4_.rs000066400000000000000000000225201473735473700147320ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct i64x4 { pub(crate) avx2: m256i } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct i64x4 { pub(crate) a : i64x2, pub(crate) b : i64x2 } } } int_uint_consts!(i64, 4, i64x4, 256); unsafe impl Zeroable for i64x4 {} unsafe impl Pod for i64x4 {} impl Add for i64x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: add_i64_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.add(rhs.a), b : self.b.add(rhs.b), } } } } } impl Sub for i64x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: sub_i64_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.sub(rhs.a), b : self.b.sub(rhs.b), } } } } } impl Mul for i64x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { let arr1: [i64; 4] = cast(self); let arr2: [i64; 4] = cast(rhs); cast([ arr1[0].wrapping_mul(arr2[0]), arr1[1].wrapping_mul(arr2[1]), arr1[2].wrapping_mul(arr2[2]), arr1[3].wrapping_mul(arr2[3]), ]) } else { Self { a: self.a.mul(rhs.a), b: self.b.mul(rhs.b) } } } } } impl Add for i64x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: i64) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for i64x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: i64) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for i64x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: i64) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for i64 { type Output = i64x4; #[inline] #[must_use] fn add(self, rhs: i64x4) -> Self::Output { i64x4::splat(self).add(rhs) } } impl Sub for i64 { type Output = i64x4; #[inline] #[must_use] fn sub(self, rhs: i64x4) -> Self::Output { i64x4::splat(self).sub(rhs) } } impl Mul for i64 { type Output = i64x4; #[inline] #[must_use] fn mul(self, rhs: i64x4) -> Self::Output { i64x4::splat(self).mul(rhs) } } impl BitAnd for i64x4 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitand_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitand(rhs.a), b : self.b.bitand(rhs.b), } } } } } impl BitOr for i64x4 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitor(rhs.a), b : self.b.bitor(rhs.b), } } } } } impl BitXor for i64x4 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitxor(rhs.a), b : self.b.bitxor(rhs.b), } } } } } macro_rules! impl_shl_t_for_i64x4 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for i64x4 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shl_all_u64_m256i(self.avx2, shift) } } else { Self { a : self.a.shl(rhs), b : self.b.shl(rhs), } } } } })+ }; } impl_shl_t_for_i64x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_i64x4 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for i64x4 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shr_all_u64_m256i(self.avx2, shift) } } else { Self { a : self.a.shr(rhs), b : self.b.shr(rhs), } } } } })+ }; } impl_shr_t_for_i64x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); impl CmpEq for i64x4 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: cmp_eq_mask_i64_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.cmp_eq(rhs.a), b : self.b.cmp_eq(rhs.b), } } } } } impl CmpGt for i64x4 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { Self { avx2: cmp_gt_mask_i64_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.cmp_gt(rhs.a), b : self.b.cmp_gt(rhs.b), } } } } } impl CmpLt for i64x4 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: !(cmp_gt_mask_i64_m256i(self.avx2, rhs.avx2) ^ cmp_eq_mask_i64_m256i(self.avx2, rhs.avx2)) } } else { Self { a : self.a.cmp_lt(rhs.a), b : self.b.cmp_lt(rhs.b), } } } } } impl i64x4 { #[inline] #[must_use] pub const fn new(array: [i64; 4]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: blend_varying_i8_m256i(f.avx2,t.avx2,self.avx2) } } else { Self { a : self.a.blend(t.a, f.a), b : self.b.blend(t.b, f.b), } } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="avx2")] { // avx x86 doesn't have this builtin let arr: [i64; 4] = cast(self); cast( [ arr[0].wrapping_abs(), arr[1].wrapping_abs(), arr[2].wrapping_abs(), arr[3].wrapping_abs(), ]) } else { Self { a : self.a.abs(), b : self.b.abs(), } } } } #[inline] #[must_use] pub fn unsigned_abs(self) -> u64x4 { pick! { if #[cfg(target_feature="avx2")] { // avx x86 doesn't have this builtin let arr: [i64; 4] = cast(self); cast( [ arr[0].unsigned_abs(), arr[1].unsigned_abs(), arr[2].unsigned_abs(), arr[3].unsigned_abs(), ]) } else { u64x4 { a : self.a.unsigned_abs(), b : self.b.unsigned_abs(), } } } } #[inline] #[must_use] pub fn round_float(self) -> f64x4 { let arr: [i64; 4] = cast(self); cast([arr[0] as f64, arr[1] as f64, arr[2] as f64, arr[3] as f64]) } /// returns the bit mask for each high bit set in the vector with the lowest /// lane being the lowest bit #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="avx2")] { // use f64 move_mask since it is the same size as i64 move_mask_m256d(cast(self.avx2)) } else { self.a.move_mask() | (self.b.move_mask() << 2) } } } /// true if any high bits are set for any value in the vector #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { move_mask_m256d(cast(self.avx2)) != 0 } else { (self.a | self.b).any() } } } /// true if all high bits are set for every value in the vector #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { move_mask_m256d(cast(self.avx2)) == 0b1111 } else { (self.a & self.b).all() } } } /// true if no high bits are set for any values of the vector #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] pub fn to_array(self) -> [i64; 4] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[i64; 4] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [i64; 4] { cast_mut(self) } } impl Not for i64x4 { type Output = Self; #[inline] fn not(self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: self.avx2.not() } } else { Self { a : self.a.not(), b : self.b.not(), } } } } } wide-0.7.32/src/i8x16_.rs000066400000000000000000000673471473735473700147530ustar00rootroot00000000000000use super::*; pick! 
{ if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct i8x16 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] pub struct i8x16 { pub(crate) simd: v128 } impl Default for i8x16 { fn default() -> Self { Self::splat(0) } } impl PartialEq for i8x16 { fn eq(&self, other: &Self) -> bool { u8x16_all_true(i8x16_eq(self.simd, other.simd)) } } impl Eq for i8x16 { } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ use core::arch::aarch64::*; #[repr(C)] #[derive(Copy, Clone)] pub struct i8x16 { pub(crate) neon : int8x16_t } impl Default for i8x16 { #[inline] #[must_use] fn default() -> Self { Self::splat(0) } } impl PartialEq for i8x16 { #[inline] #[must_use] fn eq(&self, other: &Self) -> bool { unsafe { vminvq_u8(vceqq_s8(self.neon, other.neon))==u8::MAX } } } impl Eq for i8x16 { } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct i8x16 { arr: [i8;16] } } } int_uint_consts!(i8, 16, i8x16, 128); unsafe impl Zeroable for i8x16 {} unsafe impl Pod for i8x16 {} impl Add for i8x16 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: add_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_s8(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_add(rhs.arr[0]), self.arr[1].wrapping_add(rhs.arr[1]), self.arr[2].wrapping_add(rhs.arr[2]), self.arr[3].wrapping_add(rhs.arr[3]), self.arr[4].wrapping_add(rhs.arr[4]), self.arr[5].wrapping_add(rhs.arr[5]), self.arr[6].wrapping_add(rhs.arr[6]), self.arr[7].wrapping_add(rhs.arr[7]), self.arr[8].wrapping_add(rhs.arr[8]), self.arr[9].wrapping_add(rhs.arr[9]), self.arr[10].wrapping_add(rhs.arr[10]), self.arr[11].wrapping_add(rhs.arr[11]), self.arr[12].wrapping_add(rhs.arr[12]), self.arr[13].wrapping_add(rhs.arr[13]), self.arr[14].wrapping_add(rhs.arr[14]), self.arr[15].wrapping_add(rhs.arr[15]), ]} } } } } impl Sub for i8x16 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: sub_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vsubq_s8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].wrapping_sub(rhs.arr[0]), self.arr[1].wrapping_sub(rhs.arr[1]), self.arr[2].wrapping_sub(rhs.arr[2]), self.arr[3].wrapping_sub(rhs.arr[3]), self.arr[4].wrapping_sub(rhs.arr[4]), self.arr[5].wrapping_sub(rhs.arr[5]), self.arr[6].wrapping_sub(rhs.arr[6]), self.arr[7].wrapping_sub(rhs.arr[7]), self.arr[8].wrapping_sub(rhs.arr[8]), self.arr[9].wrapping_sub(rhs.arr[9]), self.arr[10].wrapping_sub(rhs.arr[10]), self.arr[11].wrapping_sub(rhs.arr[11]), self.arr[12].wrapping_sub(rhs.arr[12]), self.arr[13].wrapping_sub(rhs.arr[13]), self.arr[14].wrapping_sub(rhs.arr[14]), self.arr[15].wrapping_sub(rhs.arr[15]), ]} } } } } impl Add for i8x16 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: i8) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for i8x16 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: i8) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Add for i8 { type Output = i8x16; #[inline] #[must_use] fn add(self, rhs: i8x16) -> Self::Output { i8x16::splat(self).add(rhs) } } impl Sub for i8 { type Output = i8x16; #[inline] #[must_use] fn sub(self, rhs: i8x16) -> Self::Output { i8x16::splat(self).sub(rhs) } } impl BitAnd for i8x16 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitand_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vandq_s8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitand(rhs.arr[0]), self.arr[1].bitand(rhs.arr[1]), self.arr[2].bitand(rhs.arr[2]), self.arr[3].bitand(rhs.arr[3]), self.arr[4].bitand(rhs.arr[4]), self.arr[5].bitand(rhs.arr[5]), self.arr[6].bitand(rhs.arr[6]), self.arr[7].bitand(rhs.arr[7]), self.arr[8].bitand(rhs.arr[8]), self.arr[9].bitand(rhs.arr[9]), self.arr[10].bitand(rhs.arr[10]), self.arr[11].bitand(rhs.arr[11]), self.arr[12].bitand(rhs.arr[12]), self.arr[13].bitand(rhs.arr[13]), self.arr[14].bitand(rhs.arr[14]), self.arr[15].bitand(rhs.arr[15]), ]} } } } } impl BitOr for i8x16 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vorrq_s8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitor(rhs.arr[0]), self.arr[1].bitor(rhs.arr[1]), self.arr[2].bitor(rhs.arr[2]), self.arr[3].bitor(rhs.arr[3]), self.arr[4].bitor(rhs.arr[4]), self.arr[5].bitor(rhs.arr[5]), self.arr[6].bitor(rhs.arr[6]), self.arr[7].bitor(rhs.arr[7]), self.arr[8].bitor(rhs.arr[8]), self.arr[9].bitor(rhs.arr[9]), self.arr[10].bitor(rhs.arr[10]), self.arr[11].bitor(rhs.arr[11]), self.arr[12].bitor(rhs.arr[12]), self.arr[13].bitor(rhs.arr[13]), self.arr[14].bitor(rhs.arr[14]), self.arr[15].bitor(rhs.arr[15]), ]} } } } } impl BitXor for i8x16 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: bitxor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: veorq_s8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitxor(rhs.arr[0]), self.arr[1].bitxor(rhs.arr[1]), self.arr[2].bitxor(rhs.arr[2]), self.arr[3].bitxor(rhs.arr[3]), self.arr[4].bitxor(rhs.arr[4]), self.arr[5].bitxor(rhs.arr[5]), self.arr[6].bitxor(rhs.arr[6]), self.arr[7].bitxor(rhs.arr[7]), self.arr[8].bitxor(rhs.arr[8]), self.arr[9].bitxor(rhs.arr[9]), self.arr[10].bitxor(rhs.arr[10]), self.arr[11].bitxor(rhs.arr[11]), self.arr[12].bitxor(rhs.arr[12]), self.arr[13].bitxor(rhs.arr[13]), self.arr[14].bitxor(rhs.arr[14]), self.arr[15].bitxor(rhs.arr[15]), ]} } } } } impl CmpEq for i8x16 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_eq_mask_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s8_u8(vceqq_s8(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] == rhs.arr[0] { -1 } else { 0 }, if self.arr[1] == rhs.arr[1] { -1 } else { 0 }, if self.arr[2] == rhs.arr[2] { -1 } else { 0 }, if self.arr[3] == rhs.arr[3] { -1 } else { 0 }, if self.arr[4] == rhs.arr[4] { -1 } else { 0 }, if self.arr[5] == rhs.arr[5] { -1 } else { 0 }, if self.arr[6] == rhs.arr[6] { -1 } else { 0 }, if self.arr[7] == rhs.arr[7] { -1 } else { 0 }, if self.arr[8] == rhs.arr[8] { -1 } else { 0 }, if self.arr[9] == rhs.arr[9] { -1 } else { 0 }, if self.arr[10] == rhs.arr[10] { -1 } else { 0 }, if self.arr[11] == rhs.arr[11] { -1 } else { 0 }, if self.arr[12] == rhs.arr[12] { -1 } else { 0 }, if self.arr[13] == rhs.arr[13] { -1 } else { 0 }, if self.arr[14] == rhs.arr[14] { -1 } else { 0 }, if self.arr[15] == rhs.arr[15] { -1 } else { 0 }, ]} } } } } impl CmpGt for i8x16 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_gt_mask_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_gt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s8_u8(vcgtq_s8(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] > rhs.arr[0] { -1 } else { 0 }, if self.arr[1] > rhs.arr[1] { -1 } else { 0 }, if self.arr[2] > rhs.arr[2] { -1 } else { 0 }, if self.arr[3] > rhs.arr[3] { -1 } else { 0 }, if self.arr[4] > rhs.arr[4] { -1 } else { 0 }, if self.arr[5] > rhs.arr[5] { -1 } else { 0 }, if self.arr[6] > rhs.arr[6] { -1 } else { 0 }, if self.arr[7] > rhs.arr[7] { -1 } else { 0 }, if self.arr[8] > rhs.arr[8] { -1 } else { 0 }, if self.arr[9] > rhs.arr[9] { -1 } else { 0 }, if self.arr[10] > rhs.arr[10] { -1 } else { 0 }, if self.arr[11] > rhs.arr[11] { -1 } else { 0 }, if self.arr[12] > rhs.arr[12] { -1 } else { 0 }, if self.arr[13] > rhs.arr[13] { -1 } else { 0 }, if self.arr[14] > rhs.arr[14] { -1 } else { 0 }, if self.arr[15] > rhs.arr[15] { -1 } else { 0 }, ]} } } } } impl CmpLt for i8x16 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: cmp_lt_mask_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_lt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vreinterpretq_s8_u8(vcltq_s8(self.neon, rhs.neon)) }} } else { Self { arr: [ if self.arr[0] < rhs.arr[0] { -1 } else { 0 }, if self.arr[1] < rhs.arr[1] { -1 } else { 0 }, if self.arr[2] < rhs.arr[2] { -1 } else { 0 }, if self.arr[3] < rhs.arr[3] { -1 } else { 0 }, if self.arr[4] < rhs.arr[4] { -1 } else { 0 }, if self.arr[5] < rhs.arr[5] { -1 } else { 0 }, if self.arr[6] < rhs.arr[6] { -1 } else { 0 }, if self.arr[7] < rhs.arr[7] { -1 } else { 0 }, if self.arr[8] < rhs.arr[8] { -1 } else { 0 }, if self.arr[9] < rhs.arr[9] { -1 } else { 0 }, if self.arr[10] < rhs.arr[10] { -1 } else { 0 }, if self.arr[11] < rhs.arr[11] { -1 } else { 0 }, if self.arr[12] < rhs.arr[12] { -1 } else { 0 }, if self.arr[13] < rhs.arr[13] { -1 } else { 0 }, if self.arr[14] < rhs.arr[14] { -1 } else { 0 }, if self.arr[15] < rhs.arr[15] { -1 } else { 0 }, ]} } } } } impl i8x16 { #[inline] #[must_use] pub const fn new(array: [i8; 16]) -> Self { unsafe { core::intrinsics::transmute(array) } } /// converts `i16` to `i8`, saturating values that are too large #[inline] #[must_use] pub fn from_i16x16_saturate(v: i16x16) -> i8x16 { pick! { if #[cfg(target_feature="avx2")] { i8x16 { sse: pack_i16_to_i8_m128i( extract_m128i_from_m256i::<0>(v.avx2), extract_m128i_from_m256i::<1>(v.avx2)) } } else if #[cfg(target_feature="sse2")] { i8x16 { sse: pack_i16_to_i8_m128i( v.a.sse, v.b.sse ) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { use core::arch::aarch64::*; unsafe { i8x16 { neon: vcombine_s8(vqmovn_s16(v.a.neon), vqmovn_s16(v.b.neon)) } } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; i8x16 { simd: i8x16_narrow_i16x8(v.a.simd, v.b.simd) } } else { fn clamp(a : i16) -> i8 { if a < i8::MIN as i16 { i8::MIN } else if a > i8::MAX as i16 { i8::MAX } else { a as i8 } } i8x16::new([ clamp(v.as_array_ref()[0]), clamp(v.as_array_ref()[1]), clamp(v.as_array_ref()[2]), clamp(v.as_array_ref()[3]), clamp(v.as_array_ref()[4]), clamp(v.as_array_ref()[5]), clamp(v.as_array_ref()[6]), clamp(v.as_array_ref()[7]), clamp(v.as_array_ref()[8]), clamp(v.as_array_ref()[9]), clamp(v.as_array_ref()[10]), clamp(v.as_array_ref()[11]), clamp(v.as_array_ref()[12]), clamp(v.as_array_ref()[13]), clamp(v.as_array_ref()[14]), clamp(v.as_array_ref()[15]), ]) } } } /// converts `i16` to `i8`, truncating the upper bits if they are set #[inline] #[must_use] pub fn from_i16x16_truncate(v: i16x16) -> i8x16 { pick! { if #[cfg(target_feature="avx2")] { let a = v.avx2.bitand(set_splat_i16_m256i(0xff)); i8x16 { sse: pack_i16_to_u8_m128i( extract_m128i_from_m256i::<0>(a), extract_m128i_from_m256i::<1>(a)) } } else if #[cfg(target_feature="sse2")] { let mask = set_splat_i16_m128i(0xff); i8x16 { sse: pack_i16_to_u8_m128i( v.a.sse.bitand(mask), v.b.sse.bitand(mask) ) } } else { // no super good intrinsics on other platforms... 
plain old codegen does a reasonable job i8x16::new([ v.as_array_ref()[0] as i8, v.as_array_ref()[1] as i8, v.as_array_ref()[2] as i8, v.as_array_ref()[3] as i8, v.as_array_ref()[4] as i8, v.as_array_ref()[5] as i8, v.as_array_ref()[6] as i8, v.as_array_ref()[7] as i8, v.as_array_ref()[8] as i8, v.as_array_ref()[9] as i8, v.as_array_ref()[10] as i8, v.as_array_ref()[11] as i8, v.as_array_ref()[12] as i8, v.as_array_ref()[13] as i8, v.as_array_ref()[14] as i8, v.as_array_ref()[15] as i8, ]) } } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vbslq_s8(vreinterpretq_u8_s8(self.neon), t.neon, f.neon) }} } else { generic_bit_blend(self, t, f) } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="ssse3")] { Self { sse: abs_i8_m128i(self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vabsq_s8(self.neon) }} } else { let arr: [i8; 16] = cast(self); cast([ arr[0].wrapping_abs(), arr[1].wrapping_abs(), arr[2].wrapping_abs(), arr[3].wrapping_abs(), arr[4].wrapping_abs(), arr[5].wrapping_abs(), arr[6].wrapping_abs(), arr[7].wrapping_abs(), arr[8].wrapping_abs(), arr[9].wrapping_abs(), arr[10].wrapping_abs(), arr[11].wrapping_abs(), arr[12].wrapping_abs(), arr[13].wrapping_abs(), arr[14].wrapping_abs(), arr[15].wrapping_abs(), ]) } } } #[inline] #[must_use] pub fn unsigned_abs(self) -> u8x16 { pick! { if #[cfg(target_feature="ssse3")] { u8x16 { sse: abs_i8_m128i(self.sse) } } else if #[cfg(target_feature="simd128")] { u8x16 { simd: i8x16_abs(self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { u8x16 { neon: vreinterpretq_u8_s8(vabsq_s8(self.neon)) }} } else { let arr: [i8; 16] = cast(self); cast( [ arr[0].unsigned_abs(), arr[1].unsigned_abs(), arr[2].unsigned_abs(), arr[3].unsigned_abs(), arr[4].unsigned_abs(), arr[5].unsigned_abs(), arr[6].unsigned_abs(), arr[7].unsigned_abs(), arr[8].unsigned_abs(), arr[9].unsigned_abs(), arr[10].unsigned_abs(), arr[11].unsigned_abs(), arr[12].unsigned_abs(), arr[13].unsigned_abs(), arr[14].unsigned_abs(), arr[15].unsigned_abs(), ]) } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: max_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_max(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxq_s8(self.neon, rhs.neon) }} } else { self.cmp_lt(rhs).blend(rhs, self) } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: min_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_min(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminq_s8(self.neon, rhs.neon) }} } else { self.cmp_lt(rhs).blend(self, rhs) } } } #[inline] #[must_use] pub fn from_slice_unaligned(input: &[i8]) -> Self { assert!(input.len() >= 16); pick! 
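// For the two `i16x16` narrowing conversions above (the `i16x16` constructor
// lives in its own module): saturate clamps to the i8 range, truncate just
// keeps the low byte. Sketch:
//
//   let v = i16x16::new([300, -300, 127, -128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
//   let sat = i8x16::from_i16x16_saturate(v); // lanes: [127, -128, 127, -128, 0, ...]
//   let tr  = i8x16::from_i16x16_truncate(v); // lanes: [ 44,  -44, 127, -128, 0, ...]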
{ if #[cfg(target_feature="sse2")] { unsafe { Self { sse: load_unaligned_m128i( &*(input.as_ptr() as * const [u8;16]) ) } } } else if #[cfg(target_feature="simd128")] { unsafe { Self { simd: v128_load(input.as_ptr() as *const v128 ) } } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vld1q_s8( input.as_ptr() as *const i8 ) } } } else { // 2018 edition doesn't have try_into unsafe { Self::new( *(input.as_ptr() as * const [i8;16]) ) } } } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="sse2")] { move_mask_i8_m128i(self.sse) } else if #[cfg(target_feature="simd128")] { i8x16_bitmask(self.simd) as i32 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { // set all to 1 if top bit is set, else 0 let masked = vcltq_s8(self.neon, vdupq_n_s8(0)); // select the right bit out of each lane let selectbit : uint8x16_t = core::intrinsics::transmute([1u8, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128]); let out = vandq_u8(masked, selectbit); // interleave the lanes so that a 16-bit sum accumulates the bits in the right order let table : uint8x16_t = core::intrinsics::transmute([0u8, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15]); let r = vqtbl1q_u8(out, table); // horizontally add the 16-bit lanes vaddvq_u16(vreinterpretq_u16_u8(r)) as i32 } } else { ((self.arr[0] < 0) as i32) << 0 | ((self.arr[1] < 0) as i32) << 1 | ((self.arr[2] < 0) as i32) << 2 | ((self.arr[3] < 0) as i32) << 3 | ((self.arr[4] < 0) as i32) << 4 | ((self.arr[5] < 0) as i32) << 5 | ((self.arr[6] < 0) as i32) << 6 | ((self.arr[7] < 0) as i32) << 7 | ((self.arr[8] < 0) as i32) << 8 | ((self.arr[9] < 0) as i32) << 9 | ((self.arr[10] < 0) as i32) << 10 | ((self.arr[11] < 0) as i32) << 11 | ((self.arr[12] < 0) as i32) << 12 | ((self.arr[13] < 0) as i32) << 13 | ((self.arr[14] < 0) as i32) << 14 | ((self.arr[15] < 0) as i32) << 15 } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="sse2")] { move_mask_i8_m128i(self.sse) != 0 } else if #[cfg(target_feature="simd128")] { u8x16_bitmask(self.simd) != 0 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe { vminvq_s8(self.neon) < 0 } } else { let v : [u64;2] = cast(self); ((v[0] | v[1]) & 0x80808080808080) != 0 } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="sse2")] { move_mask_i8_m128i(self.sse) == 0b1111_1111_1111_1111 } else if #[cfg(target_feature="simd128")] { u8x16_bitmask(self.simd) == 0b1111_1111_1111_1111 } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe { vmaxvq_s8(self.neon) < 0 } } else { let v : [u64;2] = cast(self); (v[0] & v[1] & 0x80808080808080) == 0x80808080808080 } } } /// Returns a new vector where each element is based on the index values in /// `rhs`. /// /// * Index values in the range `[0, 15]` select the i-th element of `self`. /// * Index values that are out of range will cause that output lane to be /// `0`. #[inline] pub fn swizzle(self, rhs: i8x16) -> i8x16 { pick! 
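// Sketch for `swizzle` (doc above): each output lane i is `self[rhs[i]]`, and
// any index outside `0..=15` produces 0.
//
//   let table = i8x16::new([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]);
//   let idx   = i8x16::new([15, 0, 3, 3, 40, 1, 2, 7, 0, 0, 0, 0, 0, 0, 0, 0]);
//   let out   = table.swizzle(idx);
//   // -> [25, 10, 13, 13, 0, 11, 12, 17, 10, 10, 10, 10, 10, 10, 10, 10]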
{ if #[cfg(target_feature="ssse3")] { Self { sse: shuffle_av_i8z_all_m128i(self.sse, add_saturating_u8_m128i(rhs.sse, set_splat_i8_m128i(0x70))) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_swizzle(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe { Self { neon: vqtbl1q_s8(self.neon, vreinterpretq_u8_s8(rhs.neon)) } } } else { let idxs = rhs.to_array(); let arr = self.to_array(); let mut out = [0i8;16]; for i in 0..16 { let idx = idxs[i] as usize; if idx >= 16 { out[i] = 0; } else { out[i] = arr[idx]; } } Self::new(out) } } } /// Works like [`swizzle`](Self::swizzle) with the following additional /// details /// /// * Indices in the range `[0, 15]` will select the i-th element of `self`. /// * If the high bit of any index is set (meaning that the index is /// negative), then the corresponding output lane is guaranteed to be zero. /// * Otherwise the output lane is either `0` or `self[rhs[i] % 16]`, /// depending on the implementation. #[inline] pub fn swizzle_relaxed(self, rhs: i8x16) -> i8x16 { pick! { if #[cfg(target_feature="ssse3")] { Self { sse: shuffle_av_i8z_all_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_swizzle(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe { Self { neon: vqtbl1q_s8(self.neon, vreinterpretq_u8_s8(rhs.neon)) } } } else { let idxs = rhs.to_array(); let arr = self.to_array(); let mut out = [0i8;16]; for i in 0..16 { let idx = idxs[i] as usize; if idx >= 16 { out[i] = 0; } else { out[i] = arr[idx]; } } Self::new(out) } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] #[must_use] pub fn saturating_add(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: add_saturating_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_add_sat(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vqaddq_s8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].saturating_add(rhs.arr[0]), self.arr[1].saturating_add(rhs.arr[1]), self.arr[2].saturating_add(rhs.arr[2]), self.arr[3].saturating_add(rhs.arr[3]), self.arr[4].saturating_add(rhs.arr[4]), self.arr[5].saturating_add(rhs.arr[5]), self.arr[6].saturating_add(rhs.arr[6]), self.arr[7].saturating_add(rhs.arr[7]), self.arr[8].saturating_add(rhs.arr[8]), self.arr[9].saturating_add(rhs.arr[9]), self.arr[10].saturating_add(rhs.arr[10]), self.arr[11].saturating_add(rhs.arr[11]), self.arr[12].saturating_add(rhs.arr[12]), self.arr[13].saturating_add(rhs.arr[13]), self.arr[14].saturating_add(rhs.arr[14]), self.arr[15].saturating_add(rhs.arr[15]), ]} } } } #[inline] #[must_use] pub fn saturating_sub(self, rhs: Self) -> Self { pick! 
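// `saturating_add` above clamps at the i8 bounds instead of wrapping like the
// plain `+` operator. Sketch:
//
//   let a = i8x16::splat(100);
//   assert_eq!(a.saturating_add(a).to_array(), [127; 16]);
//   assert_eq!((a + a).to_array(), [-56; 16]); // 200 wraps to -56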
{ if #[cfg(target_feature="sse2")] { Self { sse: sub_saturating_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: i8x16_sub_sat(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vqsubq_s8(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].saturating_sub(rhs.arr[0]), self.arr[1].saturating_sub(rhs.arr[1]), self.arr[2].saturating_sub(rhs.arr[2]), self.arr[3].saturating_sub(rhs.arr[3]), self.arr[4].saturating_sub(rhs.arr[4]), self.arr[5].saturating_sub(rhs.arr[5]), self.arr[6].saturating_sub(rhs.arr[6]), self.arr[7].saturating_sub(rhs.arr[7]), self.arr[8].saturating_sub(rhs.arr[8]), self.arr[9].saturating_sub(rhs.arr[9]), self.arr[10].saturating_sub(rhs.arr[10]), self.arr[11].saturating_sub(rhs.arr[11]), self.arr[12].saturating_sub(rhs.arr[12]), self.arr[13].saturating_sub(rhs.arr[13]), self.arr[14].saturating_sub(rhs.arr[14]), self.arr[15].saturating_sub(rhs.arr[15]), ]} } } } #[inline] pub fn to_array(self) -> [i8; 16] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[i8; 16] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [i8; 16] { cast_mut(self) } } wide-0.7.32/src/i8x32_.rs000066400000000000000000000207271473735473700147400ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct i8x32 { avx: m256i } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct i8x32 { a : i8x16, b : i8x16 } } } int_uint_consts!(i8, 32, i8x32, 256); unsafe impl Zeroable for i8x32 {} unsafe impl Pod for i8x32 {} impl Add for i8x32 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx: add_i8_m256i(self.avx,rhs.avx) } } else { Self { a : self.a.add(rhs.a), b : self.b.add(rhs.b), } } } } } impl Sub for i8x32 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx: sub_i8_m256i(self.avx,rhs.avx) } } else { Self { a : self.a.sub(rhs.a), b : self.b.sub(rhs.b), } } } } } impl Add for i8x32 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: i8) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for i8x32 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: i8) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Add for i8 { type Output = i8x32; #[inline] #[must_use] fn add(self, rhs: i8x32) -> Self::Output { i8x32::splat(self).add(rhs) } } impl Sub for i8 { type Output = i8x32; #[inline] #[must_use] fn sub(self, rhs: i8x32) -> Self::Output { i8x32::splat(self).sub(rhs) } } impl BitAnd for i8x32 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx : bitand_m256i(self.avx,rhs.avx) } } else { Self { a : self.a.bitand(rhs.a), b : self.b.bitand(rhs.b), } } } } } impl BitOr for i8x32 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx : bitor_m256i(self.avx,rhs.avx) } } else { Self { a : self.a.bitor(rhs.a), b : self.b.bitor(rhs.b), } } } } } impl BitXor for i8x32 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { Self { avx : bitxor_m256i(self.avx,rhs.avx) } } else { Self { a : self.a.bitxor(rhs.a), b : self.b.bitxor(rhs.b), } } } } } impl CmpEq for i8x32 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx : cmp_eq_mask_i8_m256i(self.avx,rhs.avx) } } else { Self { a : self.a.cmp_eq(rhs.a), b : self.b.cmp_eq(rhs.b), } } } } } impl CmpGt for i8x32 { type Output = Self; #[inline] #[must_use] fn cmp_gt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx : cmp_gt_mask_i8_m256i(self.avx,rhs.avx) } } else { Self { a : self.a.cmp_gt(rhs.a), b : self.b.cmp_gt(rhs.b), } } } } } impl CmpLt for i8x32 { type Output = Self; #[inline] #[must_use] fn cmp_lt(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx : !(cmp_gt_mask_i8_m256i(self.avx,rhs.avx) ^ cmp_eq_mask_i8_m256i(self.avx,rhs.avx)) } } else { Self { a : self.a.cmp_lt(rhs.a), b : self.b.cmp_lt(rhs.b), } } } } } impl i8x32 { #[inline] #[must_use] pub const fn new(array: [i8; 32]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx: blend_varying_i8_m256i(f.avx, t.avx, self.avx) } } else { Self { a : self.a.blend(t.a, f.a), b : self.b.blend(t.b, f.b), } } } } #[inline] #[must_use] pub fn abs(self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx: abs_i8_m256i(self.avx) } } else { Self { a : self.a.abs(), b : self.b.abs(), } } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx: max_i8_m256i(self.avx,rhs.avx) } } else { Self { a : self.a.max(rhs.a), b : self.b.max(rhs.b), } } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx: min_i8_m256i(self.avx,rhs.avx) } } else { Self { a : self.a.min(rhs.a), b : self.b.min(rhs.b), } } } } #[inline] #[must_use] pub fn saturating_add(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx: add_saturating_i8_m256i(self.avx, rhs.avx) } } else { Self { a : self.a.saturating_add(rhs.a), b : self.b.saturating_add(rhs.b), } } } } #[inline] #[must_use] pub fn saturating_sub(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx: sub_saturating_i8_m256i(self.avx, rhs.avx) } } else { Self { a : self.a.saturating_sub(rhs.a), b : self.b.saturating_sub(rhs.b), } } } } #[inline] #[must_use] pub fn move_mask(self) -> i32 { pick! { if #[cfg(target_feature="avx2")] { move_mask_i8_m256i(self.avx) } else { self.a.move_mask() | (self.b.move_mask() << 16) } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { move_mask_i8_m256i(self.avx) != 0 } else { (self.a | self.b).any() } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { move_mask_i8_m256i(self.avx) == -1 } else { (self.a & self.b).all() } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } /// Returns a new vector with lanes selected from the lanes of the first input /// vector a specified in the second input vector `rhs`. /// The indices i in range `[0, 15]` select the i-th element of `self`. For /// indices outside of the range the resulting lane is `0`. 
/// /// This note that is the equivalent of two parallel swizzle operations on the /// two halves of the vector, and the indexes each refer to the /// corresponding half. #[inline] pub fn swizzle_half(self, rhs: i8x32) -> i8x32 { pick! { if #[cfg(target_feature="avx2")] { Self { avx: shuffle_av_i8z_half_m256i(self.avx, rhs.saturating_add(i8x32::splat(0x60)).avx) } } else { Self { a : self.a.swizzle(rhs.a), b : self.b.swizzle(rhs.b), } } } } /// Indices in the range `[0, 15]` will select the i-th element of `self`. If /// the high bit of any element of `rhs` is set (negative) then the /// corresponding output lane is guaranteed to be zero. Otherwise if the /// element of `rhs` is within the range `[32, 127]` then the output lane is /// either `0` or `self[rhs[i] % 16]` depending on the implementation. /// /// This is the equivalent to two parallel swizzle operations on the two /// halves of the vector, and the indexes each refer to their corresponding /// half. #[inline] pub fn swizzle_half_relaxed(self, rhs: i8x32) -> i8x32 { pick! { if #[cfg(target_feature="avx2")] { Self { avx: shuffle_av_i8z_half_m256i(self.avx, rhs.avx) } } else { Self { a : self.a.swizzle_relaxed(rhs.a), b : self.b.swizzle_relaxed(rhs.b), } } } } #[inline] pub fn to_array(self) -> [i8; 32] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[i8; 32] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [i8; 32] { cast_mut(self) } } wide-0.7.32/src/lib.rs000066400000000000000000001345321473735473700144720ustar00rootroot00000000000000#![no_std] #![allow(non_camel_case_types)] #![warn(clippy::doc_markdown)] #![warn(clippy::missing_inline_in_public_items)] #![allow(clippy::eq_op)] #![allow(clippy::excessive_precision)] #![allow(clippy::let_and_return)] #![allow(clippy::unusual_byte_groupings)] #![allow(clippy::misrefactored_assign_op)] #![cfg_attr(test, allow(clippy::approx_constant))] //! A crate to help you go wide. //! //! This crate provides SIMD-compatible data types. //! //! When possible, explicit SIMD is used with all the math operations here. As a //! fallback, the fact that all the lengths of a fixed length array are doing //! the same thing will often make LLVM notice that it should use SIMD //! instructions to complete the task. In the worst case, the code just becomes //! totally scalar (though the math is still correct, at least). //! //! ## Crate Features //! //! * `std`: This causes the feature to link to `std`. //! * Currently this just improves the performance of `sqrt` when an explicit //! SIMD `sqrt` isn't available. // Note(Lokathor): Due to standard library magic, the std-only methods for f32 // and f64 will automatically be available simply by declaring this. #[cfg(feature = "std")] extern crate std; // TODO // Add/Sub/Mul/Div with constant // Shuffle left/right/by index use core::{ fmt::{ Binary, Debug, Display, LowerExp, LowerHex, Octal, UpperExp, UpperHex, }, ops::*, }; #[allow(unused_imports)] use safe_arch::*; use bytemuck::*; #[cfg(feature = "serde")] use serde::{ser::SerializeTuple, Deserialize, Serialize}; #[macro_use] mod macros; macro_rules! 
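// `pick!`, defined here, is the crate's compile-time backend dispatcher: each
// `if #[cfg(...)]` arm is re-emitted behind `cfg(all(its own test,
// not(any(all earlier tests))))`, so exactly one arm survives for any given
// target. A minimal usage sketch (illustrative, not from the original
// sources):
//
//   pick! {
//     if #[cfg(target_feature = "sse2")] {
//       fn backend_name() -> &'static str { "sse2" }
//     } else {
//       fn backend_name() -> &'static str { "scalar fallback" }
//     }
//   }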
pick { ($(if #[cfg($($test:meta),*)] { $($if_tokens:tt)* })else+ else { $($else_tokens:tt)* }) => { pick!{ @__forests [ ] ; $( [ {$($test),*} {$($if_tokens)*} ], )* [ { } {$($else_tokens)*} ], } }; (if #[cfg($($if_meta:meta),*)] { $($if_tokens:tt)* } $(else if #[cfg($($else_meta:meta),*)] { $($else_tokens:tt)* })*) => { pick!{ @__forests [ ] ; [ {$($if_meta),*} {$($if_tokens)*} ], $( [ {$($else_meta),*} {$($else_tokens)*} ], )* } }; (@__forests [$($not:meta,)*];) => { /* halt expansion */ }; (@__forests [$($not:meta,)*]; [{$($m:meta),*} {$($tokens:tt)*}], $($rest:tt)*) => { #[cfg(all( $($m,)* not(any($($not),*)) ))] pick!{ @__identity $($tokens)* } pick!{ @__forests [ $($not,)* $($m,)* ] ; $($rest)* } }; (@__identity $($tokens:tt)*) => { $($tokens)* }; } // TODO: make these generic over `mul_add`? Worth it? macro_rules! polynomial_2 { ($x:expr, $c0:expr, $c1:expr, $c2:expr $(,)?) => {{ let x = $x; let x2 = x * x; x2.mul_add($c2, x.mul_add($c1, $c0)) }}; } macro_rules! polynomial_3 { ($x:expr, $c0:expr, $c1:expr, $c2:expr, $c3:expr $(,)?) => {{ let x = $x; let x2 = x * x; $c3.mul_add(x, $c2).mul_add(x2, $c1.mul_add(x, $c0)) }}; } macro_rules! polynomial_4 { ($x:expr, $c0:expr, $c1:expr, $c2:expr ,$c3:expr, $c4:expr $(,)?) => {{ let x = $x; let x2 = x * x; let x4 = x2 * x2; $c3.mul_add(x, $c2).mul_add(x2, $c1.mul_add(x, $c0)) + $c4 * x4 }}; } macro_rules! polynomial_5 { ($x:expr, $c0:expr, $c1:expr, $c2:expr, $c3:expr, $c4:expr, $c5:expr $(,)?) => {{ let x = $x; let x2 = x * x; let x4 = x2 * x2; $c3 .mul_add(x, $c2) .mul_add(x2, $c5.mul_add(x, $c4).mul_add(x4, $c1.mul_add(x, $c0))) }}; } macro_rules! polynomial_5n { ($x:expr, $c0:expr, $c1:expr, $c2:expr, $c3:expr, $c4:expr $(,)?) => {{ let x = $x; let x2 = x * x; let x4 = x2 * x2; x2.mul_add(x.mul_add($c3, $c2), (x4.mul_add($c4 + x, x.mul_add($c1, $c0)))) }}; } macro_rules! polynomial_6 { ($x:expr, $c0:expr, $c1:expr, $c2:expr, $c3:expr, $c4:expr, $c5:expr ,$c6:expr $(,)?) => {{ let x = $x; let x2 = x * x; let x4 = x2 * x2; x4.mul_add( x2.mul_add($c6, x.mul_add($c5, $c4)), x2.mul_add(x.mul_add($c3, $c2), x.mul_add($c1, $c0)), ) }}; } macro_rules! polynomial_6n { ($x:expr, $c0:expr, $c1:expr, $c2:expr, $c3:expr, $c4:expr, $c5:expr $(,)?) => {{ let x = $x; let x2 = x * x; let x4 = x2 * x2; x4.mul_add( x.mul_add($c5, x2 + $c4), x2.mul_add(x.mul_add($c3, $c2), x.mul_add($c1, $c0)), ) }}; } macro_rules! polynomial_8 { ($x:expr, $c0:expr, $c1:expr, $c2:expr, $c3:expr, $c4:expr, $c5:expr, $c6:expr, $c7:expr, $c8:expr $(,)?) => {{ let x = $x; let x2 = x * x; let x4 = x2 * x2; let x8 = x4 * x4; x4.mul_add( x2.mul_add($c7.mul_add(x, $c6), x.mul_add($c5, $c4)), x8.mul_add($c8, x2.mul_add(x.mul_add($c3, $c2), x.mul_add($c1, $c0))), ) }}; } macro_rules! polynomial_13 { // calculates polynomial c13*x^13 + c12*x^12 + ... + c1*x + c0 ($x:expr, $c2:expr, $c3:expr, $c4:expr, $c5:expr,$c6:expr, $c7:expr, $c8:expr,$c9:expr, $c10:expr, $c11:expr, $c12:expr, $c13:expr $(,)?) => {{ let x = $x; let x2 = x * x; let x4 = x2 * x2; let x8 = x4 * x4; x8.mul_add( x4.mul_add( x.mul_add($c13, $c12), x2.mul_add(x.mul_add($c11, $c10), x.mul_add($c9, $c8)), ), x4.mul_add( x2.mul_add(x.mul_add($c7, $c6), x.mul_add($c5, $c4)), x2.mul_add(x.mul_add($c3, $c2), x), ), ) }}; } macro_rules! polynomial_13m { // return ((c8+c9*x) + (c10+c11*x)*x2 + (c12+c13*x)*x4)*x8 + (((c6+c7*x)*x2 + // (c4+c5*x))*x4 + ((c2+c3*x)*x2 + x)); ($x:expr, $c2:expr, $c3:expr, $c4:expr, $c5:expr,$c6:expr, $c7:expr, $c8:expr,$c9:expr, $c10:expr, $c11:expr, $c12:expr, $c13:expr $(,)?) 
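// The polynomial_* macros in this block evaluate fixed-degree polynomials
// with `mul_add`, grouping terms by x, x^2, x^4 (and x^8 for the higher
// degrees) in an Estrin-like layout so the fused multiply-adds can overlap
// rather than forming one long Horner chain. Hand-checked sketch for the
// simplest one (illustrative):
//
//   polynomial_2!(x, c0, c1, c2) == c2*x^2 + (c1*x + c0)
//   e.g. x = 2.0, c0 = 1.0, c1 = 3.0, c2 = 5.0  ->  5.0*4.0 + (3.0*2.0 + 1.0) = 27.0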
=> {{ let x = $x; let x2 = x * x; let x4 = x2 * x2; let x8 = x4 * x4; x8.mul_add( x4.mul_add( x.mul_add($c13, $c12), x2.mul_add(x.mul_add($c11, $c10), x.mul_add($c9, $c8)), ), x4.mul_add( x2.mul_add(x.mul_add($c7, $c6), x.mul_add($c5, $c4)), x2.mul_add(x.mul_add($c3, $c2), x), ), ) }}; } mod f32x8_; pub use f32x8_::*; mod f32x4_; pub use f32x4_::*; mod f64x4_; pub use f64x4_::*; mod f64x2_; pub use f64x2_::*; mod i8x16_; pub use i8x16_::*; mod i16x16_; pub use i16x16_::*; mod i8x32_; pub use i8x32_::*; mod i16x8_; pub use i16x8_::*; mod i32x4_; pub use i32x4_::*; mod i32x8_; pub use i32x8_::*; mod i64x2_; pub use i64x2_::*; mod i64x4_; pub use i64x4_::*; mod u8x16_; pub use u8x16_::*; mod u16x8_; pub use u16x8_::*; mod u16x16_; pub use u16x16_::*; mod u32x4_; pub use u32x4_::*; mod u32x8_; pub use u32x8_::*; mod u64x2_; pub use u64x2_::*; mod u64x4_; pub use u64x4_::*; #[allow(dead_code)] fn generic_bit_blend(mask: T, y: T, n: T) -> T where T: Copy + BitXor + BitAnd, { n ^ ((n ^ y) & mask) } /// given `type.op(type)` and type is `Copy`, impls `type.op(&type)` macro_rules! bulk_impl_op_ref_self_for { ($(($op:ident, $method:ident) => [$($t:ty),+]),+ $(,)?) => { $( // do each trait/list matching given $( // do the current trait for each type in its list. impl $op<&Self> for $t { type Output = Self; #[inline] #[must_use] fn $method(self, rhs: &Self) -> Self::Output { self.$method(*rhs) } } )+ )+ }; } bulk_impl_op_ref_self_for! { (Add, add) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u16x16, u32x8, u32x4, u64x4, u64x2], (Sub, sub) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u16x16, u32x8, u32x4, u64x4, u64x2], (Mul, mul) => [f32x8, f32x4, f64x4, f64x2, i16x8, i16x16, i32x8, i32x4, u16x8, u16x16], (Div, div) => [f32x8, f32x4, f64x4, f64x2], (BitAnd, bitand) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u16x16,u32x8, u32x4, u64x4, u64x2], (BitOr, bitor) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u16x16, u32x8, u32x4, u64x4, u64x2], (BitXor, bitxor) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u16x16, u32x8, u32x4, u64x4, u64x2], } /// given `type.op(rhs)` and type is Copy, impls `type.op_assign(rhs)` macro_rules! bulk_impl_op_assign_for { ($(($op:ident<$rhs:ty>, $method:ident, $method_assign:ident) => [$($t:ty),+]),+ $(,)?) => { $( // do each trait/list matching given $( // do the current trait for each type in its list. impl $op<$rhs> for $t { #[inline] fn $method_assign(&mut self, rhs: $rhs) { *self = self.$method(rhs); } } )+ )+ }; } // Note: remember to update bulk_impl_op_ref_self_for first or this will give // weird errors! bulk_impl_op_assign_for! 
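// Both bulk_impl_* macros stamp out one trait impl per (trait, type) pair in
// the list they are handed. Roughly, `(AddAssign, add, add_assign)` applied
// to `f32x4` produces the following shape (illustrative sketch, not the
// literal expansion):
//
//   impl AddAssign for f32x4 {
//     #[inline]
//     fn add_assign(&mut self, rhs: f32x4) { *self = self.add(rhs); }
//   }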
{ (AddAssign, add, add_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u16x16, u32x8, u32x4, u64x4, u64x2], (AddAssign<&Self>, add, add_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u16x16, u32x8, u32x4, u64x4, u64x2], (SubAssign, sub, sub_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u16x16, u32x8, u32x4, u64x4, u64x2], (SubAssign<&Self>, sub, sub_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u16x16, u32x8, u32x4, u64x4, u64x2], (MulAssign, mul, mul_assign) => [f32x8, f32x4, f64x4, f64x2, i16x8, i16x16, i32x8, i32x4, u16x8, u16x16], (MulAssign<&Self>, mul, mul_assign) => [f32x8, f32x4, f64x4, f64x2, i16x8, i16x16, i32x8, i32x4, u16x8, u16x16], (DivAssign, div, div_assign) => [f32x8, f32x4, f64x4, f64x2], (DivAssign<&Self>, div, div_assign) => [f32x8, f32x4, f64x4, f64x2], (BitAndAssign, bitand, bitand_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, u16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u32x8, u32x4, u64x4, u64x2], (BitAndAssign<&Self>, bitand, bitand_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, u16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u32x8, u32x4, u64x4, u64x2], (BitOrAssign, bitor, bitor_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, u16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u32x8, u32x4, u64x4, u64x2], (BitOrAssign<&Self>, bitor, bitor_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, u16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u32x8, u32x4, u64x4, u64x2], (BitXorAssign, bitxor, bitxor_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, u16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u32x8, u32x4, u64x4, u64x2], (BitXorAssign<&Self>, bitxor, bitxor_assign) => [f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, u16x16, i32x8, i32x4, i64x2, u8x16, u16x8, u32x8, u32x4, u64x4, u64x2], } macro_rules! impl_simple_neg { ($($t:ty),+ $(,)?) => { $( impl Neg for $t { type Output = Self; #[inline] #[must_use] fn neg(self) -> Self::Output { Self::default() - self } } impl Neg for &'_ $t { type Output = $t; #[inline] #[must_use] fn neg(self) -> Self::Output { <$t>::default() - *self } } )+ }; } impl_simple_neg! { f32x8, f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x4, i64x2, u8x16, u16x8, u32x8, u32x4, u64x2, u64x4 } macro_rules! impl_simple_not { ($($t:ty),+ $(,)?) => { $( impl Not for $t { type Output = Self; #[inline] #[must_use] fn not(self) -> Self::Output { self ^ cast::(u128::MAX) } } impl Not for &'_ $t { type Output = $t; #[inline] #[must_use] fn not(self) -> Self::Output { *self ^ cast::(u128::MAX) } } )+ }; } impl_simple_not! { f32x4, i8x32, i8x16, i16x8, i16x16, i32x4, i64x2, u8x16, u16x8, u16x16, u32x4, u64x2, } macro_rules! impl_simple_sum { ($($t:ty),+ $(,)?) => { $( impl core::iter::Sum for $t where $t: AddAssign { #[inline] fn sum>(iter: I) -> Self { let mut total = Self::zeroed(); for val in iter { total += val; } total } } )+ }; } impl_simple_sum! { f32x4, f64x4, f64x2, i8x32, i8x16, i16x8, i16x16, i32x8, i32x4, i64x4, i64x2, u8x16, u16x8, u16x16, u32x8, u32x4, u64x2, u64x4 } macro_rules! impl_floating_product { ($($t:ty),+ $(,)?) 
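// Note on the blanket impls above: `Neg` is written as `Self::default() - self`
// (i.e. `0 - x` lane-wise, so integer vectors negate with wrapping semantics),
// and `Sum` simply folds with `+=` from a zeroed vector. Hand-checked sketch
// (illustrative):
//
//   (-i32x4::from([1, -2, 3, -4])).to_array() == [-1, 2, -3, 4]
//   [i32x4::splat(1), i32x4::splat(2)].iter().copied().sum::<i32x4>().to_array() == [3; 4]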
=> { $( impl core::iter::Product for $t where $t: MulAssign { #[inline] fn product>(iter: I) -> Self { let mut total = Self::from(1.0); for val in iter { total *= val; } total } } )+ }; } impl_floating_product! { f32x8, f32x4, f64x4, f64x2 } macro_rules! impl_integer_product { ($($t:ty),+ $(,)?) => { $( impl core::iter::Product for $t where $t: MulAssign { #[inline] fn product>(iter: I) -> Self { let mut total = Self::from(1); for val in iter { total *= val; } total } } )+ }; } impl_integer_product! { i16x8, i32x4, i32x8, } /// impls `From for b` by just calling `cast` macro_rules! impl_from_a_for_b_with_cast { ($(($arr:ty, $simd:ty)),+ $(,)?) => { $(impl From<$arr> for $simd { #[inline] #[must_use] fn from(arr: $arr) -> Self { cast(arr) } } impl From<$simd> for $arr { #[inline] #[must_use] fn from(simd: $simd) -> Self { cast(simd) } })+ }; } impl_from_a_for_b_with_cast! { ([f32;8], f32x8), ([f32;4], f32x4), ([f64;4], f64x4), ([f64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2), ([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2), ([u64;4], u64x4), } macro_rules! impl_from_single_value { ($(([$elem:ty;$len:expr], $simd:ty)),+ $(,)?) => { $(impl From<$elem> for $simd { /// Splats the single value given across all lanes. #[inline] #[must_use] fn from(elem: $elem) -> Self { cast([elem; $len]) } } impl $simd { #[inline] #[must_use] pub fn splat(elem: $elem) -> $simd { cast([elem; $len]) } })+ }; } impl_from_single_value! { ([f32;8], f32x8), ([f32;4], f32x4), ([f64;4], f64x4), ([f64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2), ([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2), ([u64;4], u64x4), } /// formatter => [(arr, simd)+],+ macro_rules! impl_formatter_for { ($($trait:ident => [$(($arr:ty, $simd:ty)),+]),+ $(,)?) => { $( // do per trait $( // do per simd type impl $trait for $simd { #[allow(clippy::missing_inline_in_public_items)] fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let a: $arr = cast(*self); write!(f, "(")?; for (x, a_ref) in a.iter().enumerate() { if x > 0 { write!(f, ", ")?; } $trait::fmt(a_ref, f)?; } write!(f, ")") } } )+ )+ } } impl_formatter_for! 
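// Every vector type gets the standard formatting traits, each printing the
// lanes as a parenthesized, comma-separated tuple, e.g. (illustrative):
//
//   format!("{:?}", f32x4::from([1.0, 2.0, 3.0, 4.0])) == "(1.0, 2.0, 3.0, 4.0)"
//
// For the float vectors, the bit-oriented traits (Binary/Octal/LowerHex/
// UpperHex) format the raw bit patterns: note the u32/u64 array types paired
// with f32x8/f32x4/f64x4/f64x2 in the table below.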
{ Binary => [([u32;8], f32x8), ([u32;4], f32x4), ([u64;4], f64x4), ([u64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2),([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2),([u64;4], u64x4)], Debug => [([f32;8], f32x8), ([f32;4], f32x4), ([f64;4], f64x4), ([f64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2),([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2),([u64;4], u64x4)], Display => [([f32;8], f32x8), ([f32;4], f32x4), ([f64;4], f64x4), ([f64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2),([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2),([u64;4], u64x4)], LowerExp => [([f32;8], f32x8), ([f32;4], f32x4), ([u64;4], f64x4), ([u64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2),([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2),([u64;4], u64x4)], LowerHex => [([u32;8], f32x8), ([u32;4], f32x4), ([u64;4], f64x4), ([u64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2),([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2),([u64;4], u64x4)], Octal => [([u32;8], f32x8), ([u32;4], f32x4), ([u64;4], f64x4), ([u64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2),([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2),([u64;4], u64x4)], UpperExp => [([u32;8], f32x8), ([u32;4], f32x4), ([u64;4], f64x4), ([u64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2),([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2),([u64;4], u64x4)], UpperHex => [([u32;8], f32x8), ([u32;4], f32x4), ([u64;4], f64x4), ([u64;2], f64x2), ([i8;32], i8x32), ([i8;16], i8x16), ([i16;8], i16x8), ([i16;16], i16x16), ([i32;8], i32x8), ([i32;4], i32x4), ([i64;2], i64x2),([i64;4], i64x4), ([u8;16], u8x16), ([u16;8], u16x8), ([u16;16], u16x16), ([u32;8], u32x8), ([u32;4], u32x4), ([u64;2], u64x2),([u64;4], u64x4)], } // With const generics this could be simplified I hope macro_rules! 
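// `from_array`, defined next, generates `From<&[T]>` impls that accept any
// slice up to the lane count, zero-fill the remaining lanes, and panic if the
// slice is longer than the vector. Usage sketch (illustrative):
//
//   let v = i32x8::from(&[1, 2, 3][..]);
//   // v.to_array() == [1, 2, 3, 0, 0, 0, 0, 0]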
from_array { ($ty:ty,$dst:ty,$dst_wide:ident,32) => { impl From<&[$ty]> for $dst_wide { #[inline] fn from(src: &[$ty]) -> $dst_wide { match src.len() { 32 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst, src[23] as $dst, src[24] as $dst, src[25] as $dst, src[26] as $dst, src[27] as $dst, src[28] as $dst, src[29] as $dst, src[30] as $dst, src[31] as $dst,]), 31 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst, src[23] as $dst, src[24] as $dst, src[25] as $dst, src[26] as $dst, src[27] as $dst, src[28] as $dst, src[29] as $dst, src[30] as $dst,0 as $dst,]), 30 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst, src[23] as $dst, src[24] as $dst, src[25] as $dst, src[26] as $dst, src[27] as $dst, src[28] as $dst, src[29] as $dst,0 as $dst,0 as $dst,]), 29 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst, src[23] as $dst, src[24] as $dst, src[25] as $dst, src[26] as $dst, src[27] as $dst, src[28] as $dst,0 as $dst,0 as $dst,0 as $dst,]), 28 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst, src[23] as $dst, src[24] as $dst, src[25] as $dst, src[26] as $dst, src[27] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 27 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst, src[23] as $dst, src[24] as $dst, src[25] as $dst, src[26] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 26 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as 
$dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst, src[23] as $dst, src[24] as $dst, src[25] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 25 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst, src[23] as $dst, src[24] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 24 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst, src[23] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 23 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst, src[22] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 22 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst, src[21] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 21 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst, src[20] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 20 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst, src[19] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 19 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, 
src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst, src[18] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 18 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst, src[17] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 17 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst, src[16] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 16 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 15 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 14 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 13 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 12 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 11 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst,0 as $dst,0 as 
$dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 10 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 9 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 8 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 7 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 6 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 5 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 4 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 3 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 2 => $dst_wide::from([src[0] as $dst, src[1] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as 
$dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 1 => $dst_wide::from([src[0] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), _ => panic!( "Converting from an array larger than what can be stored in $dst_wide" ), } } } }; ($ty:ty,$dst:ty,$dst_wide:ident,16) => { impl From<&[$ty]> for $dst_wide { #[inline] fn from(src: &[$ty]) -> $dst_wide { match src.len() { 16 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst, src[15] as $dst,]), 15 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst, src[14] as $dst,0 as $dst,]), 14 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst, src[13] as $dst,0 as $dst,0 as $dst,]), 13 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst, src[12] as $dst,0 as $dst,0 as $dst,0 as $dst,]), 12 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst, src[11] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 11 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst, src[10] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 10 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst, src[9] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 9 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst, src[8] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 8 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 7 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 6 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 5 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as 
$dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 4 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 3 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 2 => $dst_wide::from([src[0] as $dst, src[1] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 1 => $dst_wide::from([src[0] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), _ => panic!( "Converting from an array larger than what can be stored in $dst_wide" ), } } } }; ($ty:ty,$dst:ty,$dst_wide:ident,8) => { impl From<&[$ty]> for $dst_wide { #[inline] fn from(src: &[$ty]) -> $dst_wide { match src.len() { 8 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst, src[7] as $dst,]), 7 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst, src[6] as $dst,0 as $dst,]), 6 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst, src[5] as $dst,0 as $dst,0 as $dst,]), 5 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst, src[4] as $dst,0 as $dst,0 as $dst,0 as $dst,]), 4 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 3 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 2 => $dst_wide::from([src[0] as $dst, src[1] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 1 => $dst_wide::from([src[0] as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), 0 => $dst_wide::from([0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,0 as $dst,]), _ => panic!( "Converting from an array larger than what can be stored in $dst_wide" ), } } } }; ($ty:ty,$dst:ty,$dst_wide:ident,4) => { impl From<&[$ty]> for $dst_wide { #[inline] fn from(src: &[$ty]) -> $dst_wide { match src.len() { 4 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst, src[3] as $dst,]), 3 => $dst_wide::from([src[0] as $dst, src[1] as $dst, src[2] as $dst,0 as $dst,]), 2 => $dst_wide::from([src[0] as $dst, src[1] as $dst,0 as $dst,0 as $dst,]), 1 => $dst_wide::from([src[0] as $dst,0 as $dst,0 as $dst,0 as $dst,]), _ => panic!( "Converting from an array larger than what can be stored in $dst_wide" ), } } } }; } from_array!(i8, i8, i8x32, 32); from_array!(i8, i8, i8x16, 16); from_array!(i8, i32, i32x8, 8); from_array!(u8, u8, u8x16, 16); from_array!(i16, i16, i16x16, 16); from_array!(u16, u16, u16x16, 16); from_array!(i32, i32, i32x8, 8); from_array!(f32, f32, f32x8, 8); from_array!(f32, f32, f32x4, 4); from_array!(f64, f64, f64x4, 4); from_array!(u64, u64, u64x4, 4); from_array!(i64, i64, i64x4, 4); #[allow(unused)] fn software_sqrt(x: f64) -> f64 { use core::num::Wrapping; type wu32 = Wrapping; const fn w(u: u32) -> wu32 { Wrapping(u) } let mut z: f64; let sign: wu32 = w(0x80000000); 
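// `sign` masks the top bit of a 32-bit half. What follows is a classic
// digit-by-digit (bit-at-a-time, base 2) square root carried out on the raw
// IEEE-754 representation: the f64 is split into two u32 halves, the exponent
// is normalized, one result bit per iteration is accumulated into `q`/`q1`,
// and the rounding direction is decided with a floating add before the halves
// are reassembled. It exists as the fallback for `no_std` builds on targets
// without a hardware sqrt.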
let mut ix0: i32; let mut s0: i32; let mut q: i32; let mut m: i32; let mut t: i32; let mut i: i32; let mut r: wu32; let mut t1: wu32; let mut s1: wu32; let mut ix1: wu32; let mut q1: wu32; // extract data pick! { if #[cfg(target_endian = "little")] { let [low, high]: [u32; 2] = cast(x); ix0 = high as i32; ix1 = w(low); } else { let [high, low]: [u32; 2] = cast(x); ix0 = high as i32; ix1 = w(low); } } // inf and nan { if x.is_nan() { return f64::NAN; } if ix0 & 0x7ff00000 == 0x7ff00000 { return x * x + x; } } // handle zero { if ix0 <= 0 { if ((ix0 & (!sign).0 as i32) | (ix1.0 as i32)) == 0 { return x; } else if ix0 < 0 { return (x - x) / (x - x); } } } // normalize { m = ix0 >> 20; if m == 0 { // subnormal while ix0 == 0 { m -= 21; ix0 |= (ix1 >> 11).0 as i32; ix1 <<= 21; } i = 0; while ix0 & 0x00100000 == 0 { ix0 <<= 1; i += 1; } m -= i - 1; ix0 |= (ix1.0 >> (31 - i)) as i32; ix1 <<= i as usize; } // un-bias exponent m -= 1023; ix0 = (ix0 & 0x000fffff) | 0x00100000; if (m & 1) != 0 { // odd m, double the input to make it even ix0 += ix0 + ((ix1 & sign) >> 31).0 as i32; ix1 += ix1; } m >>= 1; } // generate sqrt bit by bit { ix0 += ix0 + ((ix1 & sign) >> 31).0 as i32; ix1 += ix1; // q and q1 store the sqrt(x); q = 0; q1 = w(0); s0 = 0; s1 = w(0); // our bit that moves from right to left r = w(0x00200000); while r != w(0) { t = s0 + (r.0 as i32); if t <= ix0 { s0 = t + (r.0 as i32); ix0 -= t; q += (r.0 as i32); } ix0 += ix0 + ((ix1 & sign) >> 31).0 as i32; ix1 += ix1; r >>= 1; } r = sign; while r != w(0) { t1 = s1 + r; t = s0; if (t < ix0) || ((t == ix0) && (t1 <= ix1)) { s1 = t1 + r; if t1 & sign == sign && (s1 & sign) == w(0) { s0 += 1; } ix0 -= t; if ix1 < t1 { ix0 -= 1; } ix1 -= t1; q1 += r; } ix0 += ix0 + ((ix1 & sign) >> 31).0 as i32; ix1 += ix1; r >>= 1; } } // use floating add to find out rounding direction { if ix0 | (ix1.0 as i32) != 0 { z = 1.0 - 1.0e-300; if z >= 1.0 { z = 1.0 + 1.0e-300; if q1 == w(0xffffffff) { q1 = w(0); q += 1; } else if z > 1.0 { if q1 == w(0xfffffffe) { q += 1; } q1 += w(2); } else { q1 += q1 & w(1); } } } } // finish up ix0 = (q >> 1) + 0x3fe00000; ix1 = q1 >> 1; if q & 1 == 1 { ix1 |= sign; } ix0 += m << 20; pick! { if #[cfg(target_endian = "little")] { cast::<[u32; 2], f64>([ix1.0, ix0 as u32]) } else { cast::<[u32; 2], f64>([ix0 as u32, ix1.0]) } } } #[test] fn test_software_sqrt() { assert!(software_sqrt(f64::NAN).is_nan()); assert_eq!(software_sqrt(f64::INFINITY), f64::INFINITY); assert_eq!(software_sqrt(0.0), 0.0); assert_eq!(software_sqrt(-0.0), -0.0); assert!(software_sqrt(-1.0).is_nan()); assert!(software_sqrt(f64::NEG_INFINITY).is_nan()); assert_eq!(software_sqrt(4.0), 2.0); assert_eq!(software_sqrt(9.0), 3.0); assert_eq!(software_sqrt(16.0), 4.0); assert_eq!(software_sqrt(25.0), 5.0); assert_eq!(software_sqrt(5000.0 * 5000.0), 5000.0); } pub trait CmpEq { type Output; fn cmp_eq(self, rhs: Rhs) -> Self::Output; } pub trait CmpGt { type Output; fn cmp_gt(self, rhs: Rhs) -> Self::Output; } pub trait CmpGe { type Output; fn cmp_ge(self, rhs: Rhs) -> Self::Output; } pub trait CmpNe { type Output; fn cmp_ne(self, rhs: Rhs) -> Self::Output; } pub trait CmpLt { type Output; fn cmp_lt(self, rhs: Rhs) -> Self::Output; } pub trait CmpLe { type Output; fn cmp_le(self, rhs: Rhs) -> Self::Output; } macro_rules! 
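// The Cmp* traits above are the lane-wise counterparts of the comparison
// operators: instead of one bool they return a vector whose lanes are all
// ones where the comparison holds and all zeros where it does not, which is
// the shape `blend` expects. The macro defined next lets the float vectors
// also compare against a plain scalar by splatting it first. Usage sketch
// (illustrative, assuming the crate's usual mask-then-blend pattern):
//
//   let mask = f32x4::from([1.0, 5.0, 2.0, 8.0]).cmp_gt(4.0); // rhs splatted
//   mask.blend(f32x4::splat(1.0), f32x4::splat(0.0)).to_array() == [0.0, 1.0, 0.0, 1.0]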
bulk_impl_const_rhs_op { (($op:ident,$method:ident) => [$(($lhs:ty,$rhs:ty),)+]) => { $( impl $op<$rhs> for $lhs { type Output = Self; #[inline] #[must_use] fn $method(self, rhs: $rhs) -> Self::Output { self.$method(<$lhs>::splat(rhs)) } } )+ }; } bulk_impl_const_rhs_op!((CmpEq, cmp_eq) => [(f64x4, f64), (f64x2, f64), (f32x4,f32), (f32x8,f32),]); bulk_impl_const_rhs_op!((CmpLt, cmp_lt) => [(f64x4, f64), (f64x2, f64), (f32x4,f32), (f32x8,f32),]); bulk_impl_const_rhs_op!((CmpGt, cmp_gt) => [(f64x4, f64), (f64x2, f64), (f32x4,f32), (f32x8,f32),]); bulk_impl_const_rhs_op!((CmpNe, cmp_ne) => [(f64x4, f64), (f64x2, f64), (f32x4,f32), (f32x8,f32),]); bulk_impl_const_rhs_op!((CmpLe, cmp_le) => [(f64x4, f64), (f64x2, f64), (f32x4,f32), (f32x8,f32),]); bulk_impl_const_rhs_op!((CmpGe, cmp_ge) => [(f64x4, f64), (f64x2, f64), (f32x4,f32), (f32x8,f32),]); macro_rules! impl_serde { ($i:ident, [$t:ty; $len:expr]) => { #[cfg(feature = "serde")] impl Serialize for $i { #[inline] fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, { let array = self.as_array_ref(); let mut seq = serializer.serialize_tuple($len)?; for e in array { seq.serialize_element(e)?; } seq.end() } } #[cfg(feature = "serde")] impl<'de> Deserialize<'de> for $i { #[inline] fn deserialize(deserializer: D) -> Result where D: serde::Deserializer<'de>, { Ok(<[$t; $len]>::deserialize(deserializer)?.into()) } } }; } impl_serde!(f32x8, [f32; 8]); impl_serde!(f32x4, [f32; 4]); impl_serde!(f64x4, [f64; 4]); impl_serde!(f64x2, [f64; 2]); impl_serde!(i8x16, [i8; 16]); impl_serde!(i16x16, [i16; 16]); impl_serde!(i8x32, [i8; 32]); impl_serde!(i16x8, [i16; 8]); impl_serde!(i32x4, [i32; 4]); impl_serde!(i32x8, [i32; 8]); impl_serde!(i64x2, [i64; 2]); impl_serde!(i64x4, [i64; 4]); impl_serde!(u8x16, [u8; 16]); impl_serde!(u16x8, [u16; 8]); impl_serde!(u16x16, [u16; 16]); impl_serde!(u32x4, [u32; 4]); impl_serde!(u32x8, [u32; 8]); impl_serde!(u64x2, [u64; 2]); impl_serde!(u64x4, [u64; 4]); wide-0.7.32/src/macros.rs000066400000000000000000000014551473735473700152050ustar00rootroot00000000000000macro_rules! int_uint_consts { ($type:ty, $lanes:expr, $simd:ty, $bits:expr) => { // ensure the size of the SIMD type is the same as the size of the array and number of bits is OK const _: () = assert!( core::mem::size_of::<$simd>() == core::mem::size_of::<[$type; $lanes]>() && core::mem::size_of::<$simd>() * 8 == $bits as usize ); impl $simd { pub const ONE: $simd = <$simd>::new([1; $lanes]); pub const ZERO: $simd = <$simd>::new([0; $lanes]); pub const MAX: $simd = <$simd>::new([<$type>::MAX; $lanes]); pub const MIN: $simd = <$simd>::new([<$type>::MIN; $lanes]); /// The number of lanes in this SIMD vector. pub const LANES: u16 = $lanes; /// The size of this SIMD vector in bits. pub const BITS: u16 = $bits; } }; } wide-0.7.32/src/u16x16_.rs000066400000000000000000000201051473735473700150230ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct u16x16 { pub(crate) avx2: m256i } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct u16x16 { pub(crate) a : u16x8, pub(crate) b : u16x8 } } } int_uint_consts!(u16, 16, u16x16, 256); unsafe impl Zeroable for u16x16 {} unsafe impl Pod for u16x16 {} impl Add for u16x16 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { Self { avx2: add_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.add(rhs.a), b : self.b.add(rhs.b), } } } } } impl Sub for u16x16 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: sub_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.sub(rhs.a), b : self.b.sub(rhs.b), } } } } } impl Add for u16x16 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: u16) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for u16x16 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: u16) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Add for u16 { type Output = u16x16; #[inline] #[must_use] fn add(self, rhs: u16x16) -> Self::Output { u16x16::splat(self).add(rhs) } } impl Sub for u16 { type Output = u16x16; #[inline] #[must_use] fn sub(self, rhs: u16x16) -> Self::Output { u16x16::splat(self).sub(rhs) } } impl BitAnd for u16x16 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitand_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitand(rhs.a), b : self.b.bitand(rhs.b), } } } } } impl BitOr for u16x16 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitor(rhs.a), b : self.b.bitor(rhs.b), } } } } } impl BitXor for u16x16 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitxor(rhs.a), b : self.b.bitxor(rhs.b), } } } } } macro_rules! impl_shl_t_for_u16x16 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for u16x16 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shl_all_u16_m256i(self.avx2, shift) } } else { Self { a : self.a.shl(rhs), b : self.b.shl(rhs), } } } } })+ }; } impl_shl_t_for_u16x16!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_u16x16 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for u16x16 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shr_all_u16_m256i(self.avx2, shift) } } else { Self { a : self.a.shr(rhs), b : self.b.shr(rhs), } } } } })+ }; } impl_shr_t_for_u16x16!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); impl CmpEq for u16x16 { type Output = Self; #[inline] #[must_use] fn cmp_eq(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: cmp_eq_mask_i16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.cmp_eq(rhs.a), b : self.b.cmp_eq(rhs.b), } } } } } impl Mul for u16x16 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { // non-widening multiplication is the same for unsigned and signed Self { avx2: mul_i16_keep_low_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.mul(rhs.a), b : self.b.mul(rhs.b), } } } } } impl From for u16x16 { /// widens and sign extends to u16x16 #[inline] #[must_use] fn from(v: u8x16) -> Self { pick! { if #[cfg(target_feature="avx2")] { u16x16 { avx2:convert_to_i16_m256i_from_u8_m128i(v.sse) } } else if #[cfg(target_feature="sse2")] { u16x16 { a: u16x8 { sse: shr_imm_u16_m128i::<8>( unpack_low_i8_m128i(v.sse, v.sse)) }, b: u16x8 { sse: shr_imm_u16_m128i::<8>( unpack_high_i8_m128i(v.sse, v.sse)) }, } } else { u16x16::new([ v.as_array_ref()[0] as u16, v.as_array_ref()[1] as u16, v.as_array_ref()[2] as u16, v.as_array_ref()[3] as u16, v.as_array_ref()[4] as u16, v.as_array_ref()[5] as u16, v.as_array_ref()[6] as u16, v.as_array_ref()[7] as u16, v.as_array_ref()[8] as u16, v.as_array_ref()[9] as u16, v.as_array_ref()[10] as u16, v.as_array_ref()[11] as u16, v.as_array_ref()[12] as u16, v.as_array_ref()[13] as u16, v.as_array_ref()[14] as u16, v.as_array_ref()[15] as u16, ]) } } } } impl u16x16 { #[inline] #[must_use] pub const fn new(array: [u16; 16]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: blend_varying_i8_m256i(f.avx2, t.avx2, self.avx2) } } else { Self { a : self.a.blend(t.a, f.a), b : self.b.blend(t.b, f.b), } } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: max_u16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.max(rhs.a), b : self.b.max(rhs.b), } } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: min_u16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.min(rhs.a), b : self.b.min(rhs.b), } } } } #[inline] #[must_use] pub fn saturating_add(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: add_saturating_u16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.saturating_add(rhs.a), b : self.b.saturating_add(rhs.b), } } } } #[inline] #[must_use] pub fn saturating_sub(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: sub_saturating_u16_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.saturating_sub(rhs.a), b : self.b.saturating_sub(rhs.b), } } } } #[inline] pub fn to_array(self) -> [u16; 16] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[u16; 16] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [u16; 16] { cast_mut(self) } } wide-0.7.32/src/u16x8_.rs000066400000000000000000000474301473735473700147560ustar00rootroot00000000000000use super::*; pick! 
{ if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct u16x8 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] pub struct u16x8 { pub(crate) simd: v128 } impl Default for u16x8 { fn default() -> Self { Self::splat(0) } } impl PartialEq for u16x8 { fn eq(&self, other: &Self) -> bool { u16x8_all_true(u16x8_eq(self.simd, other.simd)) } } impl Eq for u16x8 { } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ use core::arch::aarch64::*; #[repr(C)] #[derive(Copy, Clone)] pub struct u16x8 { pub(crate) neon : uint16x8_t } impl Default for u16x8 { #[inline] #[must_use] fn default() -> Self { Self::splat(0) } } impl PartialEq for u16x8 { #[inline] #[must_use] fn eq(&self, other: &Self) -> bool { unsafe { vminvq_u16(vceqq_u16(self.neon, other.neon))==u16::MAX } } } impl Eq for u16x8 { } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct u16x8 { pub(crate) arr: [u16;8] } } } int_uint_consts!(u16, 8, u16x8, 128); unsafe impl Zeroable for u16x8 {} unsafe impl Pod for u16x8 {} impl Add for u16x8 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: add_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_u16(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_add(rhs.arr[0]), self.arr[1].wrapping_add(rhs.arr[1]), self.arr[2].wrapping_add(rhs.arr[2]), self.arr[3].wrapping_add(rhs.arr[3]), self.arr[4].wrapping_add(rhs.arr[4]), self.arr[5].wrapping_add(rhs.arr[5]), self.arr[6].wrapping_add(rhs.arr[6]), self.arr[7].wrapping_add(rhs.arr[7]), ]} } } } } impl Sub for u16x8 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vsubq_u16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].wrapping_sub(rhs.arr[0]), self.arr[1].wrapping_sub(rhs.arr[1]), self.arr[2].wrapping_sub(rhs.arr[2]), self.arr[3].wrapping_sub(rhs.arr[3]), self.arr[4].wrapping_sub(rhs.arr[4]), self.arr[5].wrapping_sub(rhs.arr[5]), self.arr[6].wrapping_sub(rhs.arr[6]), self.arr[7].wrapping_sub(rhs.arr[7]), ]} } } } } impl Mul for u16x8 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: mul_i16_keep_low_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_mul(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmulq_u16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].wrapping_mul(rhs.arr[0]), self.arr[1].wrapping_mul(rhs.arr[1]), self.arr[2].wrapping_mul(rhs.arr[2]), self.arr[3].wrapping_mul(rhs.arr[3]), self.arr[4].wrapping_mul(rhs.arr[4]), self.arr[5].wrapping_mul(rhs.arr[5]), self.arr[6].wrapping_mul(rhs.arr[6]), self.arr[7].wrapping_mul(rhs.arr[7]), ]} } } } } impl Add for u16x8 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: u16) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for u16x8 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: u16) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for u16x8 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: u16) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for u16 { type Output = u16x8; #[inline] #[must_use] fn add(self, rhs: u16x8) -> Self::Output { u16x8::splat(self).add(rhs) } } impl Sub for u16 { type Output = u16x8; #[inline] #[must_use] fn sub(self, rhs: u16x8) -> Self::Output { u16x8::splat(self).sub(rhs) } } impl Mul for u16 { type Output = u16x8; #[inline] #[must_use] fn mul(self, rhs: u16x8) -> Self::Output { u16x8::splat(self).mul(rhs) } } impl BitAnd for u16x8 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitand_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vandq_u16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitand(rhs.arr[0]), self.arr[1].bitand(rhs.arr[1]), self.arr[2].bitand(rhs.arr[2]), self.arr[3].bitand(rhs.arr[3]), self.arr[4].bitand(rhs.arr[4]), self.arr[5].bitand(rhs.arr[5]), self.arr[6].bitand(rhs.arr[6]), self.arr[7].bitand(rhs.arr[7]), ]} } } } } impl BitOr for u16x8 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vorrq_u16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitor(rhs.arr[0]), self.arr[1].bitor(rhs.arr[1]), self.arr[2].bitor(rhs.arr[2]), self.arr[3].bitor(rhs.arr[3]), self.arr[4].bitor(rhs.arr[4]), self.arr[5].bitor(rhs.arr[5]), self.arr[6].bitor(rhs.arr[6]), self.arr[7].bitor(rhs.arr[7]), ]} } } } } impl BitXor for u16x8 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: bitxor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: veorq_u16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitxor(rhs.arr[0]), self.arr[1].bitxor(rhs.arr[1]), self.arr[2].bitxor(rhs.arr[2]), self.arr[3].bitxor(rhs.arr[3]), self.arr[4].bitxor(rhs.arr[4]), self.arr[5].bitxor(rhs.arr[5]), self.arr[6].bitxor(rhs.arr[6]), self.arr[7].bitxor(rhs.arr[7]), ]} } } } } macro_rules! impl_shl_t_for_u16x8 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for u16x8 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shl_all_u16_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_shl(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_u16(self.neon, vmovq_n_s16(rhs as i16)) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] << u, self.arr[1] << u, self.arr[2] << u, self.arr[3] << u, self.arr[4] << u, self.arr[5] << u, self.arr[6] << u, self.arr[7] << u, ]} } } } })+ }; } impl_shl_t_for_u16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_u16x8 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for u16x8 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shr_all_u16_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_shr(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_u16(self.neon, vmovq_n_s16( -(rhs as i16))) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] >> u, self.arr[1] >> u, self.arr[2] >> u, self.arr[3] >> u, self.arr[4] >> u, self.arr[5] >> u, self.arr[6] >> u, self.arr[7] >> u, ]} } } } })+ }; } impl_shr_t_for_u16x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); impl u16x8 { #[inline] #[must_use] pub const fn new(array: [u16; 8]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn cmp_eq(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_eq_mask_i16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vceqq_u16(self.neon, rhs.neon) }} } else { Self { arr: [ if self.arr[0] == rhs.arr[0] { u16::MAX } else { 0 }, if self.arr[1] == rhs.arr[1] { u16::MAX } else { 0 }, if self.arr[2] == rhs.arr[2] { u16::MAX } else { 0 }, if self.arr[3] == rhs.arr[3] { u16::MAX } else { 0 }, if self.arr[4] == rhs.arr[4] { u16::MAX } else { 0 }, if self.arr[5] == rhs.arr[5] { u16::MAX } else { 0 }, if self.arr[6] == rhs.arr[6] { u16::MAX } else { 0 }, if self.arr[7] == rhs.arr[7] { u16::MAX } else { 0 }, ]} } } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! 
{ if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vbslq_u16(self.neon, t.neon, f.neon) }} } else { generic_bit_blend(self, t, f) } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: max_u16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_max(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxq_u16(self.neon, rhs.neon) }} } else { let arr: [u16; 8] = cast(self); let rhs: [u16; 8] = cast(rhs); cast([ arr[0].max(rhs[0]), arr[1].max(rhs[1]), arr[2].max(rhs[2]), arr[3].max(rhs[3]), arr[4].max(rhs[4]), arr[5].max(rhs[5]), arr[6].max(rhs[6]), arr[7].max(rhs[7]), ]) } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: min_u16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_min(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminq_u16(self.neon, rhs.neon) }} } else { let arr: [u16; 8] = cast(self); let rhs: [u16; 8] = cast(rhs); cast([ arr[0].min(rhs[0]), arr[1].min(rhs[1]), arr[2].min(rhs[2]), arr[3].min(rhs[3]), arr[4].min(rhs[4]), arr[5].min(rhs[5]), arr[6].min(rhs[6]), arr[7].min(rhs[7]), ]) } } } #[inline] #[must_use] pub fn saturating_add(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: add_saturating_u16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_add_sat(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vqaddq_u16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].saturating_add(rhs.arr[0]), self.arr[1].saturating_add(rhs.arr[1]), self.arr[2].saturating_add(rhs.arr[2]), self.arr[3].saturating_add(rhs.arr[3]), self.arr[4].saturating_add(rhs.arr[4]), self.arr[5].saturating_add(rhs.arr[5]), self.arr[6].saturating_add(rhs.arr[6]), self.arr[7].saturating_add(rhs.arr[7]), ]} } } } #[inline] #[must_use] pub fn saturating_sub(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_saturating_u16_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u16x8_sub_sat(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vqsubq_u16(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].saturating_sub(rhs.arr[0]), self.arr[1].saturating_sub(rhs.arr[1]), self.arr[2].saturating_sub(rhs.arr[2]), self.arr[3].saturating_sub(rhs.arr[3]), self.arr[4].saturating_sub(rhs.arr[4]), self.arr[5].saturating_sub(rhs.arr[5]), self.arr[6].saturating_sub(rhs.arr[6]), self.arr[7].saturating_sub(rhs.arr[7]), ]} } } } /// Unpack the lower half of the input and zero expand it to `u16` values. #[inline] #[must_use] pub fn from_u8x16_low(u: u8x16) -> Self { pick! 
{ if #[cfg(target_feature="sse2")] { Self{ sse: unpack_low_i8_m128i(u.sse, m128i::zeroed()) } } else { let u_arr: [u8; 16] = cast(u); cast([ u_arr[0] as u16, u_arr[1] as u16, u_arr[2] as u16, u_arr[3] as u16, u_arr[4] as u16, u_arr[5] as u16, u_arr[6] as u16, u_arr[7] as u16, ]) } } } /// Unpack the upper half of the input and zero expand it to `u16` values. #[inline] #[must_use] pub fn from_u8x16_high(u: u8x16) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self{ sse: unpack_high_i8_m128i(u.sse, m128i::zeroed()) } } else { let u_arr: [u8; 16] = cast(u); cast([ u_arr[8] as u16, u_arr[9] as u16, u_arr[10] as u16, u_arr[11] as u16, u_arr[12] as u16, u_arr[13] as u16, u_arr[14] as u16, u_arr[15] as u16, ]) } } } /// multiplies two u16x8 and returns the result as a widened u32x8 #[inline] #[must_use] pub fn mul_widen(self, rhs: Self) -> u32x8 { pick! { if #[cfg(target_feature="avx2")] { let a = convert_to_i32_m256i_from_u16_m128i(self.sse); let b = convert_to_i32_m256i_from_u16_m128i(rhs.sse); u32x8 { avx2: mul_i32_keep_low_m256i(a,b) } } else if #[cfg(target_feature="sse2")] { let low = mul_i16_keep_low_m128i(self.sse, rhs.sse); let high = mul_u16_keep_high_m128i(self.sse, rhs.sse); u32x8 { a: u32x4 { sse:unpack_low_i16_m128i(low, high) }, b: u32x4 { sse:unpack_high_i16_m128i(low, high) } } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { let lhs_low = unsafe { vget_low_u16(self.neon) }; let rhs_low = unsafe { vget_low_u16(rhs.neon) }; let lhs_high = unsafe { vget_high_u16(self.neon) }; let rhs_high = unsafe { vget_high_u16(rhs.neon) }; let low = unsafe { vmull_u16(lhs_low, rhs_low) }; let high = unsafe { vmull_u16(lhs_high, rhs_high) }; u32x8 { a: u32x4 { neon: low }, b: u32x4 {neon: high } } } else { let a = self.as_array_ref(); let b = rhs.as_array_ref(); u32x8::new([ u32::from(a[0]) * u32::from(b[0]), u32::from(a[1]) * u32::from(b[1]), u32::from(a[2]) * u32::from(b[2]), u32::from(a[3]) * u32::from(b[3]), u32::from(a[4]) * u32::from(b[4]), u32::from(a[5]) * u32::from(b[5]), u32::from(a[6]) * u32::from(b[6]), u32::from(a[7]) * u32::from(b[7]), ]) } } } /// Multiples two `u16x8` and return the high part of intermediate `u32x8` #[inline] #[must_use] pub fn mul_keep_high(self, rhs: Self) -> Self { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: mul_u16_keep_high_m128i(self.sse, rhs.sse) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { let lhs_low = unsafe { vget_low_u16(self.neon) }; let rhs_low = unsafe { vget_low_u16(rhs.neon) }; let lhs_high = unsafe { vget_high_u16(self.neon) }; let rhs_high = unsafe { vget_high_u16(rhs.neon) }; let low = unsafe { vmull_u16(lhs_low, rhs_low) }; let high = unsafe { vmull_u16(lhs_high, rhs_high) }; u16x8 { neon: unsafe { vuzpq_u16(vreinterpretq_u16_u32(low), vreinterpretq_u16_u32(high)).1 } } } else if #[cfg(target_feature="simd128")] { let low = u32x4_extmul_low_u16x8(self.simd, rhs.simd); let high = u32x4_extmul_high_u16x8(self.simd, rhs.simd); Self { simd: u16x8_shuffle::<1, 3, 5, 7, 9, 11, 13, 15>(low, high) } } else { u16x8::new([ ((u32::from(rhs.as_array_ref()[0]) * u32::from(self.as_array_ref()[0])) >> 16) as u16, ((u32::from(rhs.as_array_ref()[1]) * u32::from(self.as_array_ref()[1])) >> 16) as u16, ((u32::from(rhs.as_array_ref()[2]) * u32::from(self.as_array_ref()[2])) >> 16) as u16, ((u32::from(rhs.as_array_ref()[3]) * u32::from(self.as_array_ref()[3])) >> 16) as u16, ((u32::from(rhs.as_array_ref()[4]) * u32::from(self.as_array_ref()[4])) >> 16) as u16, ((u32::from(rhs.as_array_ref()[5]) * u32::from(self.as_array_ref()[5])) >> 16) as u16, ((u32::from(rhs.as_array_ref()[6]) * u32::from(self.as_array_ref()[6])) >> 16) as u16, ((u32::from(rhs.as_array_ref()[7]) * u32::from(self.as_array_ref()[7])) >> 16) as u16, ]) } } } #[inline] pub fn to_array(self) -> [u16; 8] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[u16; 8] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [u16; 8] { cast_mut(self) } } wide-0.7.32/src/u32x4_.rs000066400000000000000000000475161473735473700147550ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct u32x4 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] pub struct u32x4 { pub(crate) simd: v128 } impl Default for u32x4 { fn default() -> Self { Self::splat(0) } } impl PartialEq for u32x4 { fn eq(&self, other: &Self) -> bool { u32x4_all_true(u32x4_eq(self.simd, other.simd)) } } impl Eq for u32x4 { } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ use core::arch::aarch64::*; #[repr(C)] #[derive(Copy, Clone)] pub struct u32x4 { pub(crate) neon : uint32x4_t } impl Default for u32x4 { #[inline] #[must_use] fn default() -> Self { Self::splat(0) } } impl PartialEq for u32x4 { #[inline] #[must_use] fn eq(&self, other: &Self) -> bool { unsafe { vminvq_u32(vceqq_u32(self.neon, other.neon))==u32::MAX } } } impl Eq for u32x4 { } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct u32x4 { arr: [u32;4] } } } int_uint_consts!(u32, 4, u32x4, 128); unsafe impl Zeroable for u32x4 {} unsafe impl Pod for u32x4 {} impl Add for u32x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: add_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u32x4_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_u32(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_add(rhs.arr[0]), self.arr[1].wrapping_add(rhs.arr[1]), self.arr[2].wrapping_add(rhs.arr[2]), self.arr[3].wrapping_add(rhs.arr[3]), ]} } } } } impl Sub for u32x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u32x4_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vsubq_u32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].wrapping_sub(rhs.arr[0]), self.arr[1].wrapping_sub(rhs.arr[1]), self.arr[2].wrapping_sub(rhs.arr[2]), self.arr[3].wrapping_sub(rhs.arr[3]), ]} } } } } impl Mul for u32x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: mul_32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u32x4_mul(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmulq_u32(self.neon, rhs.neon) }} } else { let arr1: [u32; 4] = cast(self); let arr2: [u32; 4] = cast(rhs); cast([ arr1[0].wrapping_mul(arr2[0]), arr1[1].wrapping_mul(arr2[1]), arr1[2].wrapping_mul(arr2[2]), arr1[3].wrapping_mul(arr2[3]), ]) } } } } impl Add for u32x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: u32) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for u32x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: u32) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for u32x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: u32) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for u32 { type Output = u32x4; #[inline] #[must_use] fn add(self, rhs: u32x4) -> Self::Output { u32x4::splat(self).add(rhs) } } impl Sub for u32 { type Output = u32x4; #[inline] #[must_use] fn sub(self, rhs: u32x4) -> Self::Output { u32x4::splat(self).sub(rhs) } } impl Mul for u32 { type Output = u32x4; #[inline] #[must_use] fn mul(self, rhs: u32x4) -> Self::Output { u32x4::splat(self).mul(rhs) } } impl BitAnd for u32x4 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitand_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vandq_u32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitand(rhs.arr[0]), self.arr[1].bitand(rhs.arr[1]), self.arr[2].bitand(rhs.arr[2]), self.arr[3].bitand(rhs.arr[3]), ]} } } } } impl BitOr for u32x4 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: bitor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vorrq_u32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitor(rhs.arr[0]), self.arr[1].bitor(rhs.arr[1]), self.arr[2].bitor(rhs.arr[2]), self.arr[3].bitor(rhs.arr[3]), ]} } } } } impl BitXor for u32x4 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitxor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: veorq_u32(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitxor(rhs.arr[0]), self.arr[1].bitxor(rhs.arr[1]), self.arr[2].bitxor(rhs.arr[2]), self.arr[3].bitxor(rhs.arr[3]), ]} } } } } macro_rules! impl_shl_t_for_u32x4 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for u32x4 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shl_all_u32_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: u32x4_shl(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_u32(self.neon, vmovq_n_s32(rhs as i32)) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] << u, self.arr[1] << u, self.arr[2] << u, self.arr[3] << u, ]} } } } })+ }; } impl_shl_t_for_u32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_u32x4 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for u32x4 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shr_all_u32_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: u32x4_shr(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_u32(self.neon, vmovq_n_s32( -(rhs as i32))) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] >> u, self.arr[1] >> u, self.arr[2] >> u, self.arr[3] >> u, ]} } } } })+ }; } impl_shr_t_for_u32x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); /// Shifts lanes by the corresponding lane. /// /// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth /// of the type. (same as `wrapping_shr`) impl Shr for u32x4 { type Output = Self; #[inline] #[must_use] fn shr(self, rhs: u32x4) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { // mask the shift count to 31 to have same behavior on all platforms let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31)); Self { sse: shr_each_u32_m128i(self.sse, shift_by) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { // mask the shift count to 31 to have same behavior on all platforms // no right shift, have to pass negative value to left shift on neon let shift_by = vnegq_s32(vreinterpretq_s32_u32(vandq_u32(rhs.neon, vmovq_n_u32(31)))); Self { neon: vshlq_u32(self.neon, shift_by) } } } else { let arr: [u32; 4] = cast(self); let rhs: [u32; 4] = cast(rhs); cast([ arr[0].wrapping_shr(rhs[0]), arr[1].wrapping_shr(rhs[1]), arr[2].wrapping_shr(rhs[2]), arr[3].wrapping_shr(rhs[3]), ]) } } } } /// Shifts lanes by the corresponding lane. /// /// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth /// of the type. (same as `wrapping_shl`) impl Shl for u32x4 { type Output = Self; #[inline] #[must_use] fn shl(self, rhs: u32x4) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { // mask the shift count to 31 to have same behavior on all platforms let shift_by = bitand_m128i(rhs.sse, set_splat_i32_m128i(31)); Self { sse: shl_each_u32_m128i(self.sse, shift_by) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { // mask the shift count to 31 to have same behavior on all platforms let shift_by = vreinterpretq_s32_u32(vandq_u32(rhs.neon, vmovq_n_u32(31))); Self { neon: vshlq_u32(self.neon, shift_by) } } } else { let arr: [u32; 4] = cast(self); let rhs: [u32; 4] = cast(rhs); cast([ arr[0].wrapping_shl(rhs[0]), arr[1].wrapping_shl(rhs[1]), arr[2].wrapping_shl(rhs[2]), arr[3].wrapping_shl(rhs[3]), ]) } } } } impl u32x4 { #[inline] #[must_use] pub const fn new(array: [u32; 4]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn cmp_eq(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: cmp_eq_mask_i32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u32x4_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vceqq_u32(self.neon, rhs.neon) }} } else { Self { arr: [ if self.arr[0] == rhs.arr[0] { u32::MAX } else { 0 }, if self.arr[1] == rhs.arr[1] { u32::MAX } else { 0 }, if self.arr[2] == rhs.arr[2] { u32::MAX } else { 0 }, if self.arr[3] == rhs.arr[3] { u32::MAX } else { 0 }, ]} } } } #[inline] #[must_use] pub fn cmp_gt(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { // no unsigned less than so inverting the high bit will get the correct result let h = u32x4::splat(1 << 31); Self { sse: cmp_gt_mask_i32_m128i((self ^ h).sse, (rhs ^ h).sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u32x4_gt(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe {Self { neon: vcgtq_u32(self.neon, rhs.neon) }} } else { Self { arr: [ if self.arr[0] > rhs.arr[0] { u32::MAX } else { 0 }, if self.arr[1] > rhs.arr[1] { u32::MAX } else { 0 }, if self.arr[2] > rhs.arr[2] { u32::MAX } else { 0 }, if self.arr[3] > rhs.arr[3] { u32::MAX } else { 0 }, ]} } } } #[inline] #[must_use] pub fn cmp_lt(self, rhs: Self) -> Self { // lt is just gt the other way around rhs.cmp_gt(self) } /// Multiplies 32x32 bit to 64 bit and then only keeps the high 32 bits of the /// result. 
Useful for implementing divide constant value (see t_usefulness /// example) #[inline] #[must_use] pub fn mul_keep_high(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { let a = convert_to_i64_m256i_from_u32_m128i(self.sse); let b = convert_to_i64_m256i_from_u32_m128i(rhs.sse); let r = mul_u64_low_bits_m256i(a, b); // the compiler does a good job shuffling the lanes around let b : [u32;8] = cast(r); cast([b[1],b[3],b[5],b[7]]) } else if #[cfg(target_feature="sse2")] { let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse); let oddp = mul_widen_u32_odd_m128i( shr_imm_u64_m128i::<32>(self.sse), shr_imm_u64_m128i::<32>(rhs.sse)); // the compiler does a good job shuffling the lanes around let a : [u32;4]= cast(evenp); let b : [u32;4]= cast(oddp); cast([a[1],b[1],a[3],b[3]]) } else if #[cfg(target_feature="simd128")] { let low = u64x2_extmul_low_u32x4(self.simd, rhs.simd); let high = u64x2_extmul_high_u32x4(self.simd, rhs.simd); Self { simd: u32x4_shuffle::<1, 3, 5, 7>(low, high) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe { let l = vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon)); let h = vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon)); u32x4 { neon: vcombine_u32(vshrn_n_u64(l,32), vshrn_n_u64(h,32)) } } } else { let a: [u32; 4] = cast(self); let b: [u32; 4] = cast(rhs); cast([ ((u64::from(a[0]) * u64::from(b[0])) >> 32) as u32, ((u64::from(a[1]) * u64::from(b[1])) >> 32) as u32, ((u64::from(a[2]) * u64::from(b[2])) >> 32) as u32, ((u64::from(a[3]) * u64::from(b[3])) >> 32) as u32, ]) } } } /// Multiplies corresponding 32 bit lanes and returns the 64 bit result /// on the corresponding lanes. /// /// Effectively does two multiplies on 128 bit platforms, but is easier /// to use than wrapping mul_widen_u32_odd_m128i individually. #[inline] #[must_use] pub fn mul_widen(self, rhs: Self) -> u64x4 { pick! { if #[cfg(target_feature="avx2")] { // ok to sign extend since we are throwing away the high half of the result anyway let a = convert_to_i64_m256i_from_i32_m128i(self.sse); let b = convert_to_i64_m256i_from_i32_m128i(rhs.sse); cast(mul_u64_low_bits_m256i(a, b)) } else if #[cfg(target_feature="sse2")] { let evenp = mul_widen_u32_odd_m128i(self.sse, rhs.sse); let oddp = mul_widen_u32_odd_m128i( shr_imm_u64_m128i::<32>(self.sse), shr_imm_u64_m128i::<32>(rhs.sse)); u64x4 { a: u64x2 { sse: unpack_low_i64_m128i(evenp, oddp)}, b: u64x2 { sse: unpack_high_i64_m128i(evenp, oddp)} } } else if #[cfg(target_feature="simd128")] { u64x4 { a: u64x2 { simd: u64x2_extmul_low_u32x4(self.simd, rhs.simd) }, b: u64x2 { simd: u64x2_extmul_high_u32x4(self.simd, rhs.simd) }, } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))] { unsafe { u64x4 { a: u64x2 { neon: vmull_u32(vget_low_u32(self.neon), vget_low_u32(rhs.neon)) }, b: u64x2 { neon: vmull_u32(vget_high_u32(self.neon), vget_high_u32(rhs.neon)) } } } } else { let a: [u32; 4] = cast(self); let b: [u32; 4] = cast(rhs); cast([ u64::from(a[0]) * u64::from(b[0]), u64::from(a[1]) * u64::from(b[1]), u64::from(a[2]) * u64::from(b[2]), u64::from(a[3]) * u64::from(b[3]), ]) } } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! 
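// Sketch of the "divide by a constant" use mentioned in the doc comment above
// (illustrative comment only; the multiplier 0xAAAA_AAAB is the usual magic
// constant for unsigned division by 3 and is assumed here, not taken from the
// crate's t_usefulness example):
//
//   let x = u32x4::new([1, 30, 300, u32::MAX]);
//   let hi = x.mul_keep_high(u32x4::splat(0xAAAA_AAAB)); // ~ (2^33 + 1) / 3
//   let q = hi >> 1u32;                                  // q == x / 3, lane-wise
//   assert_eq!(q.to_array(), [0, 10, 100, u32::MAX / 3]);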
{ if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vbslq_u32(self.neon, t.neon, f.neon) }} } else { generic_bit_blend(self, t, f) } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: max_u32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u32x4_max(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxq_u32(self.neon, rhs.neon) }} } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxq_u16(self.neon, rhs.neon) }} } else { let arr: [u32; 4] = cast(self); let rhs: [u32; 4] = cast(rhs); cast([ arr[0].max(rhs[0]), arr[1].max(rhs[1]), arr[2].max(rhs[2]), arr[3].max(rhs[3]), ]) } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: min_u32_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u32x4_min(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminq_u32(self.neon, rhs.neon) }} } else { let arr: [u32; 4] = cast(self); let rhs: [u32; 4] = cast(rhs); cast([ arr[0].min(rhs[0]), arr[1].min(rhs[1]), arr[2].min(rhs[2]), arr[3].min(rhs[3]), ]) } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! { if #[cfg(target_feature="sse2")] { (move_mask_i8_m128i(self.sse) & 0b1000100010001000) != 0 } else if #[cfg(target_feature="simd128")] { u32x4_bitmask(self.simd) != 0 } else { let v : [u64;2] = cast(self); ((v[0] | v[1]) & 0x8000000080000000) != 0 } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="sse2")] { (move_mask_i8_m128i(self.sse) & 0b1000100010001000) == 0b1000100010001000 } else if #[cfg(target_feature="simd128")] { u32x4_bitmask(self.simd) == 0b1111 } else { let v : [u64;2] = cast(self); (v[0] & v[1] & 0x8000000080000000) == 0x8000000080000000 } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] pub fn to_array(self) -> [u32; 4] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[u32; 4] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [u32; 4] { cast_mut(self) } } wide-0.7.32/src/u32x8_.rs000066400000000000000000000243631473735473700147540ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct u32x8 { pub(crate) avx2: m256i } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct u32x8 { pub(crate) a : u32x4, pub(crate) b : u32x4 } } } int_uint_consts!(u32, 8, u32x8, 256); unsafe impl Zeroable for u32x8 {} unsafe impl Pod for u32x8 {} impl Add for u32x8 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: add_i32_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.add(rhs.a), b : self.b.add(rhs.b), } } } } } impl Sub for u32x8 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { Self { avx2: sub_i32_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.sub(rhs.a), b : self.b.sub(rhs.b), } } } } } impl Mul for u32x8 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: mul_i32_keep_low_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.mul(rhs.a), b : self.b.mul(rhs.b), } } } } } impl BitAnd for u32x8 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitand_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitand(rhs.a), b : self.b.bitand(rhs.b), } } } } } impl BitOr for u32x8 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitor(rhs.a), b : self.b.bitor(rhs.b), } } } } } impl BitXor for u32x8 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitxor(rhs.a), b : self.b.bitxor(rhs.b), } } } } } impl From for u32x8 { /// widens and zero extends to u32x8 #[inline] #[must_use] fn from(v: u16x8) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2:convert_to_i32_m256i_from_u16_m128i(v.sse) } } else if #[cfg(target_feature="sse2")] { Self { a: u32x4 { sse: shr_imm_u32_m128i::<16>( unpack_low_i16_m128i(v.sse, v.sse)) }, b: u32x4 { sse: shr_imm_u32_m128i::<16>( unpack_high_i16_m128i(v.sse, v.sse)) }, } } else { u32x8::new([ u32::from(v.as_array_ref()[0]), u32::from(v.as_array_ref()[1]), u32::from(v.as_array_ref()[2]), u32::from(v.as_array_ref()[3]), u32::from(v.as_array_ref()[4]), u32::from(v.as_array_ref()[5]), u32::from(v.as_array_ref()[6]), u32::from(v.as_array_ref()[7]), ]) } } } } macro_rules! impl_shl_t_for_u32x8 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for u32x8 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shl_all_u32_m256i(self.avx2, shift) } } else { Self { a : self.a.shl(rhs), b : self.b.shl(rhs), } } } } })+ }; } impl_shl_t_for_u32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_u32x8 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for u32x8 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shr_all_u32_m256i(self.avx2, shift) } } else { Self { a : self.a.shr(rhs), b : self.b.shr(rhs), } } } } })+ }; } impl_shr_t_for_u32x8!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); /// Shifts lanes by the corresponding lane. /// /// Bitwise shift-right; yields `self >> mask(rhs)`, where mask removes any /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth /// of the type. (same as `wrapping_shr`) impl Shr for u32x8 { type Output = Self; #[inline] #[must_use] fn shr(self, rhs: u32x8) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { // ensure same behavior as scalar wrapping_shr let shift_by = bitand_m256i(rhs.avx2, set_splat_i32_m256i(31)); Self { avx2: shr_each_u32_m256i(self.avx2, shift_by ) } } else { Self { a : self.a.shr(rhs.a), b : self.b.shr(rhs.b), } } } } } /// Shifts lanes by the corresponding lane. /// /// Bitwise shift-left; yields `self << mask(rhs)`, where mask removes any /// high-order bits of `rhs` that would cause the shift to exceed the bitwidth /// of the type. (same as `wrapping_shl`) impl Shl for u32x8 { type Output = Self; #[inline] #[must_use] fn shl(self, rhs: u32x8) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { // ensure same behavior as scalar wrapping_shl let shift_by = bitand_m256i(rhs.avx2, set_splat_i32_m256i(31)); Self { avx2: shl_each_u32_m256i(self.avx2, shift_by) } } else { Self { a : self.a.shl(rhs.a), b : self.b.shl(rhs.b), } } } } } impl u32x8 { #[inline] #[must_use] pub const fn new(array: [u32; 8]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn cmp_eq(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: cmp_eq_mask_i32_m256i(self.avx2, rhs.avx2 ) } } else { Self { a : self.a.cmp_eq(rhs.a), b : self.b.cmp_eq(rhs.b), } } } } #[inline] #[must_use] pub fn cmp_gt(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { // no unsigned gt than so inverting the high bit will get the correct result let highbit = u32x8::splat(1 << 31); Self { avx2: cmp_gt_mask_i32_m256i((self ^ highbit).avx2, (rhs ^ highbit).avx2 ) } } else { Self { a : self.a.cmp_gt(rhs.a), b : self.b.cmp_gt(rhs.b), } } } } #[inline] #[must_use] pub fn cmp_lt(self, rhs: Self) -> Self { // lt is just gt the other way around rhs.cmp_gt(self) } /// Multiplies 32x32 bit to 64 bit and then only keeps the high 32 bits of the /// result. Useful for implementing divide constant value (see t_usefulness /// example) #[inline] #[must_use] pub fn mul_keep_high(self, rhs: u32x8) -> u32x8 { pick! { if #[cfg(target_feature="avx2")] { let a : [u32;8]= cast(self); let b : [u32;8]= cast(rhs); // let the compiler shuffle the values around, it does the right thing let r1 : [u32;8] = cast(mul_u64_low_bits_m256i(cast([a[0], 0, a[1], 0, a[2], 0, a[3], 0]), cast([b[0], 0, b[1], 0, b[2], 0, b[3], 0]))); let r2 : [u32;8] = cast(mul_u64_low_bits_m256i(cast([a[4], 0, a[5], 0, a[6], 0, a[7], 0]), cast([b[4], 0, b[5], 0, b[6], 0, b[7], 0]))); cast([r1[1], r1[3], r1[5], r1[7], r2[1], r2[3], r2[5], r2[7]]) } else { Self { a : self.a.mul_keep_high(rhs.a), b : self.b.mul_keep_high(rhs.b), } } } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: blend_varying_i8_m256i(f.avx2, t.avx2, self.avx2) } } else { Self { a : self.a.blend(t.a, f.a), b : self.b.blend(t.b, f.b), } } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: max_u32_m256i(self.avx2, rhs.avx2 ) } } else { Self { a : self.a.max(rhs.a), b : self.b.max(rhs.b), } } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: min_u32_m256i(self.avx2, rhs.avx2 ) } } else { Self { a : self.a.min(rhs.a), b : self.b.min(rhs.b), } } } } #[inline] #[must_use] pub fn any(self) -> bool { pick! 
{ if #[cfg(target_feature="avx2")] { ((move_mask_i8_m256i(self.avx2) as u32) & 0b10001000100010001000100010001000) != 0 } else { (self.a | self.b).any() } } } #[inline] #[must_use] pub fn all(self) -> bool { pick! { if #[cfg(target_feature="avx2")] { ((move_mask_i8_m256i(self.avx2) as u32) & 0b10001000100010001000100010001000) == 0b10001000100010001000100010001000 } else { (self.a & self.b).all() } } } #[inline] #[must_use] pub fn none(self) -> bool { !self.any() } #[inline] pub fn to_array(self) -> [u32; 8] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[u32; 8] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [u32; 8] { cast_mut(self) } } impl Not for u32x8 { type Output = Self; #[inline] fn not(self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: self.avx2.not() } } else { Self { a : self.a.not(), b : self.b.not(), } } } } } wide-0.7.32/src/u64x2_.rs000066400000000000000000000245011473735473700147450ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct u64x2 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] pub struct u64x2 { pub(crate) simd: v128 } impl Default for u64x2 { fn default() -> Self { Self::splat(0) } } impl PartialEq for u64x2 { fn eq(&self, other: &Self) -> bool { u64x2_all_true(u64x2_eq(self.simd, other.simd)) } } impl Eq for u64x2 { } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ use core::arch::aarch64::*; #[repr(C)] #[derive(Copy, Clone)] pub struct u64x2 { pub(crate) neon : uint64x2_t } impl Default for u64x2 { #[inline] #[must_use] fn default() -> Self { unsafe { Self { neon: vdupq_n_u64(0)} } } } impl PartialEq for u64x2 { #[inline] #[must_use] fn eq(&self, other: &Self) -> bool { unsafe { vgetq_lane_u64(self.neon,0) == vgetq_lane_u64(other.neon,0) && vgetq_lane_u64(self.neon,1) == vgetq_lane_u64(other.neon,1) } } } impl Eq for u64x2 { } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct u64x2 { arr: [u64;2] } } } int_uint_consts!(u64, 2, u64x2, 128); unsafe impl Zeroable for u64x2 {} unsafe impl Pod for u64x2 {} impl Add for u64x2 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: add_i64_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u64x2_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_u64(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_add(rhs.arr[0]), self.arr[1].wrapping_add(rhs.arr[1]), ]} } } } } impl Sub for u64x2 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_i64_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u64x2_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vsubq_u64(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_sub(rhs.arr[0]), self.arr[1].wrapping_sub(rhs.arr[1]), ]} } } } } //we should try to implement this on sse2 impl Mul for u64x2 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="simd128")] { Self { simd: u64x2_mul(self.simd, rhs.simd) } } else { let arr1: [u64; 2] = cast(self); let arr2: [u64; 2] = cast(rhs); cast([ arr1[0].wrapping_mul(arr2[0]), arr1[1].wrapping_mul(arr2[1]), ]) } } } } impl Add for u64x2 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: u64) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for u64x2 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: u64) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for u64x2 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: u64) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for u64 { type Output = u64x2; #[inline] #[must_use] fn add(self, rhs: u64x2) -> Self::Output { u64x2::splat(self).add(rhs) } } impl Sub for u64 { type Output = u64x2; #[inline] #[must_use] fn sub(self, rhs: u64x2) -> Self::Output { u64x2::splat(self).sub(rhs) } } impl Mul for u64 { type Output = u64x2; #[inline] #[must_use] fn mul(self, rhs: u64x2) -> Self::Output { u64x2::splat(self).mul(rhs) } } impl BitAnd for u64x2 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitand_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vandq_u64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitand(rhs.arr[0]), self.arr[1].bitand(rhs.arr[1]), ]} } } } } impl BitOr for u64x2 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vorrq_u64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitor(rhs.arr[0]), self.arr[1].bitor(rhs.arr[1]), ]} } } } } impl BitXor for u64x2 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitxor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: veorq_u64(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitxor(rhs.arr[0]), self.arr[1].bitxor(rhs.arr[1]), ]} } } } } macro_rules! impl_shl_t_for_u64x2 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for u64x2 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shl_all_u64_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: u64x2_shl(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_u64(self.neon, vmovq_n_s64(rhs as i64)) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] << u, self.arr[1] << u, ]} } } } })+ }; } impl_shl_t_for_u64x2!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_u64x2 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for u64x2 { type Output = Self; /// Shifts all lanes by the value given. 
#[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { let shift = cast([rhs as u64, 0]); Self { sse: shr_all_u64_m128i(self.sse, shift) } } else if #[cfg(target_feature="simd128")] { Self { simd: u64x2_shr(self.simd, rhs as u32) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vshlq_u64(self.neon, vmovq_n_s64(-(rhs as i64))) }} } else { let u = rhs as u64; Self { arr: [ self.arr[0] >> u, self.arr[1] >> u, ]} } } } })+ }; } impl_shr_t_for_u64x2!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); impl u64x2 { #[inline] #[must_use] pub const fn new(array: [u64; 2]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn cmp_eq(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: cmp_eq_mask_i64_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u64x2_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vceqq_u64(self.neon, rhs.neon) } } } else { let s: [u64;2] = cast(self); let r: [u64;2] = cast(rhs); cast([ if s[0] == r[0] { -1_i64 } else { 0 }, if s[1] == r[1] { -1_i64 } else { 0 }, ]) } } } #[inline] #[must_use] pub fn cmp_gt(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse4.2")] { // no unsigned gt so inverting the high bit will get the correct result let highbit = u64x2::splat(1 << 63); Self { sse: cmp_gt_mask_i64_m128i((self ^ highbit).sse, (rhs ^ highbit).sse) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vcgtq_u64(self.neon, rhs.neon) }} } else { // u64x2_gt on WASM is not a thing. https://github.com/WebAssembly/simd/pull/414 let s: [u64;2] = cast(self); let r: [u64;2] = cast(rhs); cast([ if s[0] > r[0] { u64::MAX } else { 0 }, if s[1] > r[1] { u64::MAX } else { 0 }, ]) } } } #[inline] #[must_use] pub fn cmp_lt(self, rhs: Self) -> Self { // lt is just gt the other way around rhs.cmp_gt(self) } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vbslq_u64(self.neon, t.neon, f.neon) }} } else { generic_bit_blend(self, t, f) } } } #[inline] pub fn to_array(self) -> [u64; 2] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[u64; 2] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [u64; 2] { cast_mut(self) } } wide-0.7.32/src/u64x4_.rs000066400000000000000000000155371473735473700147600ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="avx2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct u64x4 { pub(crate) avx2: m256i } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(32))] pub struct u64x4 { pub(crate) a : u64x2, pub(crate) b : u64x2 } } } int_uint_consts!(u64, 4, u64x4, 256); unsafe impl Zeroable for u64x4 {} unsafe impl Pod for u64x4 {} impl Add for u64x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { Self { avx2: add_i64_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.add(rhs.a), b : self.b.add(rhs.b), } } } } } impl Sub for u64x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: sub_i64_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.sub(rhs.a), b : self.b.sub(rhs.b), } } } } } impl Mul for u64x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let arr1: [i64; 4] = cast(self); let arr2: [i64; 4] = cast(rhs); cast([ arr1[0].wrapping_mul(arr2[0]), arr1[1].wrapping_mul(arr2[1]), arr1[2].wrapping_mul(arr2[2]), arr1[3].wrapping_mul(arr2[3]), ]) } else { Self { a: self.a.mul(rhs.a), b: self.b.mul(rhs.b) } } } } } impl Add for u64x4 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: u64) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for u64x4 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: u64) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Mul for u64x4 { type Output = Self; #[inline] #[must_use] fn mul(self, rhs: u64) -> Self::Output { self.mul(Self::splat(rhs)) } } impl Add for u64 { type Output = u64x4; #[inline] #[must_use] fn add(self, rhs: u64x4) -> Self::Output { u64x4::splat(self).add(rhs) } } impl Sub for u64 { type Output = u64x4; #[inline] #[must_use] fn sub(self, rhs: u64x4) -> Self::Output { u64x4::splat(self).sub(rhs) } } impl Mul for u64 { type Output = u64x4; #[inline] #[must_use] fn mul(self, rhs: u64x4) -> Self::Output { u64x4::splat(self).mul(rhs) } } impl BitAnd for u64x4 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitand_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitand(rhs.a), b : self.b.bitand(rhs.b), } } } } } impl BitOr for u64x4 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitor(rhs.a), b : self.b.bitor(rhs.b), } } } } } impl BitXor for u64x4 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: bitxor_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.bitxor(rhs.a), b : self.b.bitxor(rhs.b), } } } } } macro_rules! impl_shl_t_for_u64x4 { ($($shift_type:ty),+ $(,)?) => { $(impl Shl<$shift_type> for u64x4 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shl(self, rhs: $shift_type) -> Self::Output { pick! { if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shl_all_u64_m256i(self.avx2, shift) } } else { Self { a : self.a.shl(rhs), b : self.b.shl(rhs), } } } } })+ }; } impl_shl_t_for_u64x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); macro_rules! impl_shr_t_for_u64x4 { ($($shift_type:ty),+ $(,)?) => { $(impl Shr<$shift_type> for u64x4 { type Output = Self; /// Shifts all lanes by the value given. #[inline] #[must_use] fn shr(self, rhs: $shift_type) -> Self::Output { pick! 
{ if #[cfg(target_feature="avx2")] { let shift = cast([rhs as u64, 0]); Self { avx2: shr_all_u64_m256i(self.avx2, shift) } } else { Self { a : self.a.shr(rhs), b : self.b.shr(rhs), } } } } })+ }; } impl_shr_t_for_u64x4!(i8, u8, i16, u16, i32, u32, i64, u64, i128, u128); impl u64x4 { #[inline] #[must_use] pub const fn new(array: [u64; 4]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn cmp_eq(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: cmp_eq_mask_i64_m256i(self.avx2, rhs.avx2) } } else { Self { a : self.a.cmp_eq(rhs.a), b : self.b.cmp_eq(rhs.b), } } } } #[inline] #[must_use] pub fn cmp_gt(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { // no unsigned gt than so inverting the high bit will get the correct result let highbit = u64x4::splat(1 << 63); Self { avx2: cmp_gt_mask_i64_m256i((self ^ highbit).avx2, (rhs ^ highbit).avx2) } } else { Self { a : self.a.cmp_gt(rhs.a), b : self.b.cmp_gt(rhs.b), } } } } #[inline] #[must_use] pub fn cmp_lt(self, rhs: Self) -> Self { // lt is just gt the other way around rhs.cmp_gt(self) } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: blend_varying_i8_m256i(f.avx2,t.avx2,self.avx2) } } else { Self { a : self.a.blend(t.a, f.a), b : self.b.blend(t.b, f.b), } } } } #[inline] pub fn to_array(self) -> [u64; 4] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[u64; 4] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [u64; 4] { cast_mut(self) } } impl Not for u64x4 { type Output = Self; #[inline] fn not(self) -> Self { pick! { if #[cfg(target_feature="avx2")] { Self { avx2: self.avx2.not() } } else { Self { a : self.a.not(), b : self.b.not(), } } } } } wide-0.7.32/src/u8x16_.rs000066400000000000000000000470321473735473700147540ustar00rootroot00000000000000use super::*; pick! { if #[cfg(target_feature="sse2")] { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct u8x16 { pub(crate) sse: m128i } } else if #[cfg(target_feature="simd128")] { use core::arch::wasm32::*; #[derive(Clone, Copy)] #[repr(transparent)] pub struct u8x16 { pub(crate) simd: v128 } impl Default for u8x16 { fn default() -> Self { Self::splat(0) } } impl PartialEq for u8x16 { fn eq(&self, other: &Self) -> bool { u8x16_all_true(u8x16_eq(self.simd, other.simd)) } } impl Eq for u8x16 { } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ use core::arch::aarch64::*; #[repr(C)] #[derive(Copy, Clone)] pub struct u8x16 { pub(crate) neon : uint8x16_t } impl Default for u8x16 { #[inline] #[must_use] fn default() -> Self { Self::splat(0) } } impl PartialEq for u8x16 { #[inline] #[must_use] fn eq(&self, other: &Self) -> bool { unsafe { vminvq_u8(vceqq_u8(self.neon, other.neon))==u8::MAX } } } impl Eq for u8x16 { } } else { #[derive(Default, Clone, Copy, PartialEq, Eq)] #[repr(C, align(16))] pub struct u8x16 { pub(crate) arr: [u8;16] } } } int_uint_consts!(u8, 16, u8x16, 128); unsafe impl Zeroable for u8x16 {} unsafe impl Pod for u8x16 {} impl Add for u8x16 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: add_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u8x16_add(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vaddq_u8(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].wrapping_add(rhs.arr[0]), self.arr[1].wrapping_add(rhs.arr[1]), self.arr[2].wrapping_add(rhs.arr[2]), self.arr[3].wrapping_add(rhs.arr[3]), self.arr[4].wrapping_add(rhs.arr[4]), self.arr[5].wrapping_add(rhs.arr[5]), self.arr[6].wrapping_add(rhs.arr[6]), self.arr[7].wrapping_add(rhs.arr[7]), self.arr[8].wrapping_add(rhs.arr[8]), self.arr[9].wrapping_add(rhs.arr[9]), self.arr[10].wrapping_add(rhs.arr[10]), self.arr[11].wrapping_add(rhs.arr[11]), self.arr[12].wrapping_add(rhs.arr[12]), self.arr[13].wrapping_add(rhs.arr[13]), self.arr[14].wrapping_add(rhs.arr[14]), self.arr[15].wrapping_add(rhs.arr[15]), ]} } } } } impl Sub for u8x16 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u8x16_sub(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vsubq_u8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].wrapping_sub(rhs.arr[0]), self.arr[1].wrapping_sub(rhs.arr[1]), self.arr[2].wrapping_sub(rhs.arr[2]), self.arr[3].wrapping_sub(rhs.arr[3]), self.arr[4].wrapping_sub(rhs.arr[4]), self.arr[5].wrapping_sub(rhs.arr[5]), self.arr[6].wrapping_sub(rhs.arr[6]), self.arr[7].wrapping_sub(rhs.arr[7]), self.arr[8].wrapping_sub(rhs.arr[8]), self.arr[9].wrapping_sub(rhs.arr[9]), self.arr[10].wrapping_sub(rhs.arr[10]), self.arr[11].wrapping_sub(rhs.arr[11]), self.arr[12].wrapping_sub(rhs.arr[12]), self.arr[13].wrapping_sub(rhs.arr[13]), self.arr[14].wrapping_sub(rhs.arr[14]), self.arr[15].wrapping_sub(rhs.arr[15]), ]} } } } } impl Add for u8x16 { type Output = Self; #[inline] #[must_use] fn add(self, rhs: u8) -> Self::Output { self.add(Self::splat(rhs)) } } impl Sub for u8x16 { type Output = Self; #[inline] #[must_use] fn sub(self, rhs: u8) -> Self::Output { self.sub(Self::splat(rhs)) } } impl Add for u8 { type Output = u8x16; #[inline] #[must_use] fn add(self, rhs: u8x16) -> Self::Output { u8x16::splat(self).add(rhs) } } impl Sub for u8 { type Output = u8x16; #[inline] #[must_use] fn sub(self, rhs: u8x16) -> Self::Output { u8x16::splat(self).sub(rhs) } } impl BitAnd for u8x16 { type Output = Self; #[inline] #[must_use] fn bitand(self, rhs: Self) -> Self::Output { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: bitand_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_and(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vandq_u8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitand(rhs.arr[0]), self.arr[1].bitand(rhs.arr[1]), self.arr[2].bitand(rhs.arr[2]), self.arr[3].bitand(rhs.arr[3]), self.arr[4].bitand(rhs.arr[4]), self.arr[5].bitand(rhs.arr[5]), self.arr[6].bitand(rhs.arr[6]), self.arr[7].bitand(rhs.arr[7]), self.arr[8].bitand(rhs.arr[8]), self.arr[9].bitand(rhs.arr[9]), self.arr[10].bitand(rhs.arr[10]), self.arr[11].bitand(rhs.arr[11]), self.arr[12].bitand(rhs.arr[12]), self.arr[13].bitand(rhs.arr[13]), self.arr[14].bitand(rhs.arr[14]), self.arr[15].bitand(rhs.arr[15]), ]} } } } } impl BitOr for u8x16 { type Output = Self; #[inline] #[must_use] fn bitor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_or(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vorrq_u8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitor(rhs.arr[0]), self.arr[1].bitor(rhs.arr[1]), self.arr[2].bitor(rhs.arr[2]), self.arr[3].bitor(rhs.arr[3]), self.arr[4].bitor(rhs.arr[4]), self.arr[5].bitor(rhs.arr[5]), self.arr[6].bitor(rhs.arr[6]), self.arr[7].bitor(rhs.arr[7]), self.arr[8].bitor(rhs.arr[8]), self.arr[9].bitor(rhs.arr[9]), self.arr[10].bitor(rhs.arr[10]), self.arr[11].bitor(rhs.arr[11]), self.arr[12].bitor(rhs.arr[12]), self.arr[13].bitor(rhs.arr[13]), self.arr[14].bitor(rhs.arr[14]), self.arr[15].bitor(rhs.arr[15]), ]} } } } } impl BitXor for u8x16 { type Output = Self; #[inline] #[must_use] fn bitxor(self, rhs: Self) -> Self::Output { pick! { if #[cfg(target_feature="sse2")] { Self { sse: bitxor_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_xor(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: veorq_u8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].bitxor(rhs.arr[0]), self.arr[1].bitxor(rhs.arr[1]), self.arr[2].bitxor(rhs.arr[2]), self.arr[3].bitxor(rhs.arr[3]), self.arr[4].bitxor(rhs.arr[4]), self.arr[5].bitxor(rhs.arr[5]), self.arr[6].bitxor(rhs.arr[6]), self.arr[7].bitxor(rhs.arr[7]), self.arr[8].bitxor(rhs.arr[8]), self.arr[9].bitxor(rhs.arr[9]), self.arr[10].bitxor(rhs.arr[10]), self.arr[11].bitxor(rhs.arr[11]), self.arr[12].bitxor(rhs.arr[12]), self.arr[13].bitxor(rhs.arr[13]), self.arr[14].bitxor(rhs.arr[14]), self.arr[15].bitxor(rhs.arr[15]), ]} } } } } impl u8x16 { #[inline] #[must_use] pub const fn new(array: [u8; 16]) -> Self { unsafe { core::intrinsics::transmute(array) } } #[inline] #[must_use] pub fn cmp_eq(self, rhs: Self) -> Self { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: cmp_eq_mask_i8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u8x16_eq(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vceqq_u8(self.neon, rhs.neon) }} } else { Self { arr: [ if self.arr[0] == rhs.arr[0] { u8::MAX } else { 0 }, if self.arr[1] == rhs.arr[1] { u8::MAX } else { 0 }, if self.arr[2] == rhs.arr[2] { u8::MAX } else { 0 }, if self.arr[3] == rhs.arr[3] { u8::MAX } else { 0 }, if self.arr[4] == rhs.arr[4] { u8::MAX } else { 0 }, if self.arr[5] == rhs.arr[5] { u8::MAX } else { 0 }, if self.arr[6] == rhs.arr[6] { u8::MAX } else { 0 }, if self.arr[7] == rhs.arr[7] { u8::MAX } else { 0 }, if self.arr[8] == rhs.arr[8] { u8::MAX } else { 0 }, if self.arr[9] == rhs.arr[9] { u8::MAX } else { 0 }, if self.arr[10] == rhs.arr[10] { u8::MAX } else { 0 }, if self.arr[11] == rhs.arr[11] { u8::MAX } else { 0 }, if self.arr[12] == rhs.arr[12] { u8::MAX } else { 0 }, if self.arr[13] == rhs.arr[13] { u8::MAX } else { 0 }, if self.arr[14] == rhs.arr[14] { u8::MAX } else { 0 }, if self.arr[15] == rhs.arr[15] { u8::MAX } else { 0 }, ]} } } } #[inline] #[must_use] pub fn blend(self, t: Self, f: Self) -> Self { pick! { if #[cfg(target_feature="sse4.1")] { Self { sse: blend_varying_i8_m128i(f.sse, t.sse, self.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: v128_bitselect(t.simd, f.simd, self.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vbslq_u8(self.neon, t.neon, f.neon) }} } else { generic_bit_blend(self, t, f) } } } #[inline] #[must_use] pub fn max(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: max_u8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u8x16_max(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vmaxq_u8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].max(rhs.arr[0]), self.arr[1].max(rhs.arr[1]), self.arr[2].max(rhs.arr[2]), self.arr[3].max(rhs.arr[3]), self.arr[4].max(rhs.arr[4]), self.arr[5].max(rhs.arr[5]), self.arr[6].max(rhs.arr[6]), self.arr[7].max(rhs.arr[7]), self.arr[8].max(rhs.arr[8]), self.arr[9].max(rhs.arr[9]), self.arr[10].max(rhs.arr[10]), self.arr[11].max(rhs.arr[11]), self.arr[12].max(rhs.arr[12]), self.arr[13].max(rhs.arr[13]), self.arr[14].max(rhs.arr[14]), self.arr[15].max(rhs.arr[15]), ]} } } } #[inline] #[must_use] pub fn min(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: min_u8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u8x16_min(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vminq_u8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].min(rhs.arr[0]), self.arr[1].min(rhs.arr[1]), self.arr[2].min(rhs.arr[2]), self.arr[3].min(rhs.arr[3]), self.arr[4].min(rhs.arr[4]), self.arr[5].min(rhs.arr[5]), self.arr[6].min(rhs.arr[6]), self.arr[7].min(rhs.arr[7]), self.arr[8].min(rhs.arr[8]), self.arr[9].min(rhs.arr[9]), self.arr[10].min(rhs.arr[10]), self.arr[11].min(rhs.arr[11]), self.arr[12].min(rhs.arr[12]), self.arr[13].min(rhs.arr[13]), self.arr[14].min(rhs.arr[14]), self.arr[15].min(rhs.arr[15]), ]} } } } #[inline] #[must_use] pub fn saturating_add(self, rhs: Self) -> Self { pick! 
{ if #[cfg(target_feature="sse2")] { Self { sse: add_saturating_u8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u8x16_add_sat(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe {Self { neon: vqaddq_u8(self.neon, rhs.neon) }} } else { Self { arr: [ self.arr[0].saturating_add(rhs.arr[0]), self.arr[1].saturating_add(rhs.arr[1]), self.arr[2].saturating_add(rhs.arr[2]), self.arr[3].saturating_add(rhs.arr[3]), self.arr[4].saturating_add(rhs.arr[4]), self.arr[5].saturating_add(rhs.arr[5]), self.arr[6].saturating_add(rhs.arr[6]), self.arr[7].saturating_add(rhs.arr[7]), self.arr[8].saturating_add(rhs.arr[8]), self.arr[9].saturating_add(rhs.arr[9]), self.arr[10].saturating_add(rhs.arr[10]), self.arr[11].saturating_add(rhs.arr[11]), self.arr[12].saturating_add(rhs.arr[12]), self.arr[13].saturating_add(rhs.arr[13]), self.arr[14].saturating_add(rhs.arr[14]), self.arr[15].saturating_add(rhs.arr[15]), ]} } } } #[inline] #[must_use] pub fn saturating_sub(self, rhs: Self) -> Self { pick! { if #[cfg(target_feature="sse2")] { Self { sse: sub_saturating_u8_m128i(self.sse, rhs.sse) } } else if #[cfg(target_feature="simd128")] { Self { simd: u8x16_sub_sat(self.simd, rhs.simd) } } else if #[cfg(all(target_feature="neon",target_arch="aarch64"))]{ unsafe { Self { neon: vqsubq_u8(self.neon, rhs.neon) } } } else { Self { arr: [ self.arr[0].saturating_sub(rhs.arr[0]), self.arr[1].saturating_sub(rhs.arr[1]), self.arr[2].saturating_sub(rhs.arr[2]), self.arr[3].saturating_sub(rhs.arr[3]), self.arr[4].saturating_sub(rhs.arr[4]), self.arr[5].saturating_sub(rhs.arr[5]), self.arr[6].saturating_sub(rhs.arr[6]), self.arr[7].saturating_sub(rhs.arr[7]), self.arr[8].saturating_sub(rhs.arr[8]), self.arr[9].saturating_sub(rhs.arr[9]), self.arr[10].saturating_sub(rhs.arr[10]), self.arr[11].saturating_sub(rhs.arr[11]), self.arr[12].saturating_sub(rhs.arr[12]), self.arr[13].saturating_sub(rhs.arr[13]), self.arr[14].saturating_sub(rhs.arr[14]), self.arr[15].saturating_sub(rhs.arr[15]), ]} } } } /// Unpack and interleave low lanes of two `u8x16` #[inline] #[must_use] pub fn unpack_low(lhs: u8x16, rhs: u8x16) -> u8x16 { pick! { if #[cfg(target_feature = "sse2")] { u8x16 { sse: unpack_low_i8_m128i(lhs.sse, rhs.sse) } } else if #[cfg(target_feature = "simd128")] { u8x16 { simd: u8x16_shuffle::<0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23>(lhs.simd, rhs.simd) } } else if #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] { let lhs = unsafe { vget_low_u8(lhs.neon) }; let rhs = unsafe { vget_low_u8(rhs.neon) }; let zipped = unsafe { vzip_u8(lhs, rhs) }; u8x16 { neon: unsafe { vcombine_u8(zipped.0, zipped.1) } } } else { u8x16::new([ lhs.as_array_ref()[0], rhs.as_array_ref()[0], lhs.as_array_ref()[1], rhs.as_array_ref()[1], lhs.as_array_ref()[2], rhs.as_array_ref()[2], lhs.as_array_ref()[3], rhs.as_array_ref()[3], lhs.as_array_ref()[4], rhs.as_array_ref()[4], lhs.as_array_ref()[5], rhs.as_array_ref()[5], lhs.as_array_ref()[6], rhs.as_array_ref()[6], lhs.as_array_ref()[7], rhs.as_array_ref()[7], ]) } } } /// Unpack and interleave high lanes of two `u8x16` #[inline] #[must_use] pub fn unpack_high(lhs: u8x16, rhs: u8x16) -> u8x16 { pick! 
{ if #[cfg(target_feature = "sse2")] { u8x16 { sse: unpack_high_i8_m128i(lhs.sse, rhs.sse) } } else if #[cfg(target_feature = "simd128")] { u8x16 { simd: u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(lhs.simd, rhs.simd) } } else if #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] { let lhs = unsafe { vget_high_u8(lhs.neon) }; let rhs = unsafe { vget_high_u8(rhs.neon) }; let zipped = unsafe { vzip_u8(lhs, rhs) }; u8x16 { neon: unsafe { vcombine_u8(zipped.0, zipped.1) } } } else { u8x16::new([ lhs.as_array_ref()[8], rhs.as_array_ref()[8], lhs.as_array_ref()[9], rhs.as_array_ref()[9], lhs.as_array_ref()[10], rhs.as_array_ref()[10], lhs.as_array_ref()[11], rhs.as_array_ref()[11], lhs.as_array_ref()[12], rhs.as_array_ref()[12], lhs.as_array_ref()[13], rhs.as_array_ref()[13], lhs.as_array_ref()[14], rhs.as_array_ref()[14], lhs.as_array_ref()[15], rhs.as_array_ref()[15], ]) } } } /// Pack and saturate two `i16x8` to `u8x16` #[inline] #[must_use] pub fn narrow_i16x8(lhs: i16x8, rhs: i16x8) -> Self { pick! { if #[cfg(target_feature = "sse2")] { u8x16 { sse: pack_i16_to_u8_m128i(lhs.sse, rhs.sse) } } else if #[cfg(target_feature = "simd128")] { u8x16 { simd: u8x16_narrow_i16x8(lhs.simd, rhs.simd) } } else if #[cfg(all(target_feature = "neon", target_arch = "aarch64"))] { let lhs = unsafe { vqmovun_s16(lhs.neon) }; let rhs = unsafe { vqmovun_s16(rhs.neon) }; u8x16 { neon: unsafe { vcombine_u8(lhs, rhs) } } } else { fn clamp(a: i16) -> u8 { if a < u8::MIN as i16 { u8::MIN } else if a > u8::MAX as i16 { u8::MAX } else { a as u8 } } Self { arr: [ clamp(lhs.as_array_ref()[0]), clamp(lhs.as_array_ref()[1]), clamp(lhs.as_array_ref()[2]), clamp(lhs.as_array_ref()[3]), clamp(lhs.as_array_ref()[4]), clamp(lhs.as_array_ref()[5]), clamp(lhs.as_array_ref()[6]), clamp(lhs.as_array_ref()[7]), clamp(rhs.as_array_ref()[0]), clamp(rhs.as_array_ref()[1]), clamp(rhs.as_array_ref()[2]), clamp(rhs.as_array_ref()[3]), clamp(rhs.as_array_ref()[4]), clamp(rhs.as_array_ref()[5]), clamp(rhs.as_array_ref()[6]), clamp(rhs.as_array_ref()[7]), ]} } } } #[inline] pub fn to_array(self) -> [u8; 16] { cast(self) } #[inline] pub fn as_array_ref(&self) -> &[u8; 16] { cast_ref(self) } #[inline] pub fn as_array_mut(&mut self) -> &mut [u8; 16] { cast_mut(self) } } wide-0.7.32/tests/000077500000000000000000000000001473735473700137215ustar00rootroot00000000000000wide-0.7.32/tests/all_tests/000077500000000000000000000000001473735473700157135ustar00rootroot00000000000000wide-0.7.32/tests/all_tests/main.rs000066400000000000000000000137761473735473700172230ustar00rootroot00000000000000#![allow(clippy::approx_constant)] #![allow(clippy::unnecessary_cast)] #![allow(clippy::assertions_on_constants)] #![allow(clippy::needless_range_loop)] #![allow(clippy::nonminimal_bool)] mod t_f32x4; mod t_f32x8; mod t_f64x2; mod t_f64x4; mod t_i16x16; mod t_i16x8; mod t_i32x4; mod t_i32x8; mod t_i64x2; mod t_i64x4; mod t_i8x16; mod t_i8x32; mod t_u16x16; mod t_u16x8; mod t_u32x4; mod t_u32x8; mod t_u64x2; mod t_u64x4; mod t_u8x16; mod t_usefulness; /// Generates the next pseudo-random number. /// Definitely non-cryptographic, just used for generating random test values. 
fn next_rand_u64(state: &mut u64) -> u64 { // Constants for the LCG const A: u64 = 6364136223846793005; const C: u64 = 1442695040888963407; // Update the state and calculate the next number (rotate to avoid lack of // randomness in low bits) *state = state.wrapping_mul(A).wrapping_add(C).rotate_left(31); *state } const RNG_SEED: u64 = 0x123456789abcdef0; /// Generate a pseudo-random value for a type that implements GenSample. fn gen_random(rng: &mut u64) -> T { let r = next_rand_u64(rng); // generate special values more often than random chance to test edge cases let next = match r & 0xf { 0 => 0, 1 => 1, 2 => u64::MAX, _ => next_rand_u64(rng), }; T::get_sample(next) } /// Test a vector operation against a pure scalar implementation for random /// values to make sure that the behavior is the same. This allows for easier /// for correctness for various values of the vector. fn test_random_vector_vs_scalar< V, VR, T, TR, FnVec: Fn(V, V) -> VR, FnScalar: Fn(T, T) -> TR, const N: usize, >( vector_fn: FnVec, scalar_fn: FnScalar, ) where V: Copy + From<[T; N]>, T: Copy + Default + std::fmt::Debug + GenSample, TR: Copy + PartialEq + std::fmt::Debug + Default + GenSample, VR: Copy + Into<[TR; N]>, { let mut a_arr = [T::default(); N]; let mut b_arr: [T; N] = [T::default(); N]; // use a fixed seed for reproducibility let mut rng = RNG_SEED; // do 100 iterations for _i in 0..100 { for i in 0..N { a_arr[i] = gen_random(&mut rng); b_arr[i] = gen_random(&mut rng); } let mut expected_arr: [TR; N] = [TR::default(); N]; for i in 0..N { expected_arr[i] = scalar_fn(a_arr[i], b_arr[i]); } let expected_vec_arr: [TR; N] = vector_fn(V::from(a_arr), V::from(b_arr)).into(); for i in 0..N { assert!( expected_arr[i].binary_eq(expected_vec_arr[i]), "scalar = {:?}\nvec = {:?}\na = {:?}\nb = {:?}", expected_arr, expected_vec_arr, a_arr, b_arr ); } } } /// Test a vector reduce operations that generate a scalar from a vector /// against a pure scalar implementation for random values to make /// sure that the behavior is the same. This allows for easier for correctness /// for various values of the vector. /// /// The scalar operation uses the same construction as the Rust fold function /// which takes an accumulator and returns the accumulator after applying the /// operation. 
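///
/// An illustrative sketch, added here and not part of the original comment,
/// of how a horizontal reduction might be checked with this helper. The
/// closure shapes are what matter; `reduce_add` stands in for whatever
/// reduction is under test, and a real float test would also need to mind
/// NaN samples and summation order:
///
/// ```ignore
/// test_random_vector_vs_scalar_reduce(
///   |v: f32x4| v.reduce_add(),     // vector reduction being tested
///   0.0f32,                        // starting accumulator
///   |acc, lane, _idx| acc + lane,  // scalar fold step
/// );
/// ```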
fn test_random_vector_vs_scalar_reduce< V, T, TR, FnVec: Fn(V) -> TR, FnScalar: Fn(TR, T, usize) -> TR, const N: usize, >( vector_fn: FnVec, acc: TR, scalar_fn: FnScalar, ) where V: From<[T; N]> + Into<[T; N]> + Copy + std::fmt::Debug, T: Copy + PartialEq + std::fmt::Debug + Default + GenSample, TR: Copy + PartialEq + std::fmt::Debug + Default, { let mut a_arr = [T::default(); N]; // use a fixed seed for reproducibility let mut rng = RNG_SEED; // do 100 iterations for _i in 0..100 { for i in 0..N { a_arr[i] = gen_random(&mut rng); } let mut expected_scalar = acc; for i in 0..N { expected_scalar = scalar_fn(expected_scalar, a_arr[i], i); } let expected_vec = vector_fn(V::from(a_arr)); assert_eq!( expected_scalar, expected_vec, "scalar = {:?} vec = {:?} source = {:?}", expected_scalar, expected_vec, a_arr ); } } /// trait to reduce a 64 bit pseudo-random number to a random sample value trait GenSample where Self: PartialEq + Copy, { fn get_sample(v: u64) -> Self; fn binary_eq(self, b: Self) -> bool { self == b } } impl GenSample for u64 { fn get_sample(v: u64) -> Self { v } } impl GenSample for u32 { fn get_sample(v: u64) -> Self { v as u32 } } impl GenSample for u16 { fn get_sample(v: u64) -> Self { v as u16 } } impl GenSample for u8 { fn get_sample(v: u64) -> Self { v as u8 } } impl GenSample for i64 { fn get_sample(v: u64) -> Self { v as i64 } } impl GenSample for i32 { fn get_sample(v: u64) -> Self { v as i32 } } impl GenSample for i16 { fn get_sample(v: u64) -> Self { v as i16 } } impl GenSample for i8 { fn get_sample(v: u64) -> Self { v as i8 } } impl GenSample for f32 { fn get_sample(v: u64) -> Self { // generate special float values more often than random // chance to test edge cases let m = (v >> 8) & 15; match m { 1 => f32::NAN, 2 => f32::INFINITY, 3 => f32::NEG_INFINITY, _ => ((v as i64) as f32) / 7.0, } } /// floating points Nan always fails equality so we need to special case it fn binary_eq(self, b: Self) -> bool { if self.is_nan() { b.is_nan() } else if self.is_infinite() { b.is_infinite() && self.is_sign_positive() == b.is_sign_positive() } else { (self - b).abs() < 0.000001 } } } impl GenSample for f64 { // generate special float values more often than random // chance to test edge cases fn get_sample(v: u64) -> Self { let m = (v >> 8) & 15; match m { 1 => f64::NAN, 2 => f64::INFINITY, 3 => f64::NEG_INFINITY, _ => ((v as i64) as f64) / 7.0, } } /// floating points Nan always fails equality so we need to special case it fn binary_eq(self, b: Self) -> bool { if self.is_nan() { b.is_nan() } else if self.is_infinite() { b.is_infinite() && self.is_sign_positive() == b.is_sign_positive() } else { (self - b).abs() < 0.000001 } } } wide-0.7.32/tests/all_tests/t_f32x4.rs000066400000000000000000000554561473735473700174710ustar00rootroot00000000000000use wide::*; use bytemuck::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] fn impl_debug_for_f32x4() { let expected = "(1.0, 2.0, 3.0, 4.0)"; let actual = format!("{:?}", f32x4::from([1.0, 2.0, 3.0, 4.0])); assert_eq!(expected, actual); let expected = "(1.000, 2.000, 3.000, 4.000)"; let actual = format!("{:.3?}", f32x4::from([1.0, 2.0, 3.0, 4.0])); assert_eq!(expected, actual); } #[test] fn impl_add_for_f32x4() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let b = f32x4::from([5.0, 6.0, 7.0, 8.0]); let expected = f32x4::from([6.0, 8.0, 10.0, 12.0]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_add_const_for_f32x4() { let a = f32x4::from([1.0, 2.0, 
3.0, 4.0]); let expected = f32x4::from([6.0, 7.0, 8.0, 9.0]); let actual = a + 5.0; assert_eq!(expected, actual); } #[test] fn impl_sub_const_for_f32x4() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let expected = f32x4::from([-1.0, 0.0, 1.0, 2.0]); let actual = a - 2.0; assert_eq!(expected, actual); } #[test] fn impl_mul_const_for_f32x4() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let expected = f32x4::from([2.0, 4.0, 6.0, 8.0]); let actual = a * 2.0; assert_eq!(expected, actual); } #[test] fn impl_div_const_for_f32x4() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let expected = f32x4::from([0.5, 1.0, 1.5, 2.0]); let actual = a / 2.0; assert_eq!(expected, actual); } #[test] fn impl_sub_for_f32x4() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let b = f32x4::from([5.0, 7.0, 17.0, 1.0]); let expected = f32x4::from([-4.0, -5.0, -14.0, 3.0]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_f32x4() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let b = f32x4::from([5.0, 7.0, 17.0, 1.0]); let expected = f32x4::from([5.0, 14.0, 51.0, 4.0]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_div_for_f32x4() { let a = f32x4::from([4.0, 9.0, 10.0, 12.0]); let b = f32x4::from([2.0, 2.0, 5.0, -3.0]); let expected = f32x4::from([2.0, 4.5, 2.0, -4.0]); let actual = a / b; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_f32x4() { let a = f32x4::from([0.0, 0.0, 1.0, 1.0]); let b = f32x4::from([0.0, 1.0, 0.0, 1.0]); let expected = f32x4::from([0.0, 0.0, 0.0, 1.0]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_f32x4() { let a = f32x4::from([0.0, 0.0, 1.0, 1.0]); let b = f32x4::from([0.0, 1.0, 0.0, 1.0]); let expected = f32x4::from([0.0, 1.0, 1.0, 1.0]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_f32x4() { let a = f32x4::from([0.0, 0.0, 1.0, 1.0]); let b = f32x4::from([0.0, 1.0, 0.0, 1.0]); let expected = f32x4::from([0.0, 1.0, 1.0, 0.0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_f32x4_cmp_eq() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let b = f32x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i32; 4] = [0, -1, 0, 0]; let actual: [i32; 4] = cast(a.cmp_eq(b)); assert_eq!(expected, actual); } #[test] fn impl_f32x4_cmp_ne() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let b = f32x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i32; 4] = [-1, 0, -1, -1]; let actual: [i32; 4] = cast(a.cmp_ne(b)); assert_eq!(expected, actual); } #[test] fn impl_f32x4_cmp_ge() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let b = f32x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i32; 4] = [0, -1, -1, -1]; let actual: [i32; 4] = cast(a.cmp_ge(b)); assert_eq!(expected, actual); } #[test] fn impl_f32x4_cmp_gt() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let b = f32x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i32; 4] = [0, 0, -1, -1]; let actual: [i32; 4] = cast(a.cmp_gt(b)); assert_eq!(expected, actual); } #[test] fn impl_f32x4_cmp_le() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let b = f32x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i32; 4] = [-1, -1, 0, 0]; let actual: [i32; 4] = cast(a.cmp_le(b)); assert_eq!(expected, actual); } #[test] fn impl_f32x4_cmp_lt() { let a = f32x4::from([1.0, 2.0, 3.0, 4.0]); let b = f32x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i32; 4] = [-1, 0, 0, 0]; let actual: [i32; 4] = cast(a.cmp_lt(b)); assert_eq!(expected, actual); let expected: [i32; 4] = [0, 0, 0, 0]; let actual: [i32; 4] = cast(a.cmp_lt(a)); assert_eq!(expected, actual); } #[test] 
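// Added note (not in the original source): in the blend tests below, a "true"
// mask lane is a float whose bit pattern is all ones, i.e.
// `f32::from_bits(u32::MAX)`, matching the all-ones / all-zeros lane masks
// produced by the comparison methods tested above.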
fn impl_f32x4_blend() { let use_t: f32 = f32::from_bits(u32::MAX); let t = f32x4::from([1.0, 2.0, 3.0, 4.0]); let f = f32x4::from([5.0, 6.0, 7.0, 8.0]); let mask = f32x4::from([use_t, 0.0, use_t, 0.0]); let expected = f32x4::from([1.0, 6.0, 3.0, 8.0]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_f32x4_abs() { let a = f32x4::from([-1.0, 2.0, -3.5, f32::NEG_INFINITY]); let expected = f32x4::from([1.0, 2.0, 3.5, f32::INFINITY]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_f32x4_floor() { let a = f32x4::from([-1.1, 60.9, 1.1, f32::INFINITY]); let expected = f32x4::from([-2.0, 60.0, 1.0, f32::INFINITY]); let actual = a.floor(); assert_eq!(expected, actual); } #[test] fn impl_f32x4_ceil() { let a = f32x4::from([-1.1, 60.9, 1.1, f32::NEG_INFINITY]); let expected = f32x4::from([-1.0, 61.0, 2.0, f32::NEG_INFINITY]); let actual = a.ceil(); assert_eq!(expected, actual); } #[test] fn impl_f32x4_fast_max() { let a = f32x4::from([1.0, 5.0, 3.0, -4.0]); let b = f32x4::from([2.0, 3.0, -5.0, -10.0]); let expected = f32x4::from([2.0, 5.0, 3.0, -4.0]); let actual = a.fast_max(b); assert_eq!(expected, actual); } #[test] fn impl_f32x4_max() { let a = f32x4::from([1.0, 5.0, -3.0, -0.0]); let b = f32x4::from([2.0, -5.0, -1.0, 0.0]); let expected = f32x4::from([2.0, 5.0, -1.0, 0.0]); let actual = a.max(b); assert_eq!(expected, actual); let a = f32x4::from([1.0, 5.0, 3.0, f32::NAN]); let b = f32x4::from([f32::NAN, f32::NEG_INFINITY, f32::INFINITY, 10.0]); let expected = f32x4::from([1.0, 5.0, f32::INFINITY, 10.0]); let actual = a.max(b); assert_eq!(expected, actual); } #[test] fn impl_f32x4_fast_min() { let a = f32x4::from([1.0, 5.0, 3.0, -4.0]); let b = f32x4::from([2.0, 3.0, -5.0, -10.0]); let expected = f32x4::from([1.0, 3.0, -5.0, -10.0]); let actual = a.fast_min(b); assert_eq!(expected, actual); } #[test] fn impl_f32x4_min() { let a = f32x4::from([1.0, 5.0, -3.0, -0.0]); let b = f32x4::from([2.0, -5.0, -1.0, 0.0]); let expected = f32x4::from([1.0, -5.0, -3.0, -0.0]); let actual = a.min(b); assert_eq!(expected, actual); let a = f32x4::from([1.0, 5.0, 3.0, f32::NAN]); let b = f32x4::from([f32::NAN, f32::NEG_INFINITY, f32::INFINITY, 10.0]); let expected = f32x4::from([1.0, f32::NEG_INFINITY, 3.0, 10.0]); let actual = a.min(b); assert_eq!(expected, actual); } #[test] fn impl_f32x4_is_nan() { let a = f32x4::from([0.0, f32::NAN, f32::NAN, 0.0]); let expected = [0, u32::MAX, u32::MAX, 0]; let actual: [u32; 4] = cast(a.is_nan()); assert_eq!(expected, actual); } #[test] fn impl_f32x4_is_finite() { let a = f32x4::from([f32::NAN, 1.0, f32::INFINITY, f32::NEG_INFINITY]); let expected = [0, u32::MAX, 0, 0]; let actual: [u32; 4] = cast(a.is_finite()); assert_eq!(expected, actual); } #[test] fn impl_f32x4_is_inf() { let a = f32x4::from([f32::NAN, 1.0, f32::INFINITY, f32::NEG_INFINITY]); let expected = [0, 0, u32::MAX, u32::MAX]; let actual: [u32; 4] = cast(a.is_inf()); assert_eq!(expected, actual); } #[test] fn impl_f32x4_round() { let a = f32x4::from([1.1, 2.5, 3.7, 4.0]); let expected = f32x4::from([1.0, 2.0, 4.0, 4.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f32x4::from([-1.1, -2.5, -3.7, -4.0]); let expected = f32x4::from([-1.0, -2.0, -4.0, -4.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f32x4::from([f32::INFINITY, f32::NEG_INFINITY, 5.5, 5.0]); let expected = f32x4::from([f32::INFINITY, f32::NEG_INFINITY, 6.0, 5.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f32x4::from(f32::NAN); let 
expected: [u32; 4] = [u32::MAX; 4]; let actual: [u32; 4] = cast(a.round().is_nan()); assert_eq!(expected, actual); // let a = f32x4::from(-0.0); let expected = a; let actual = a.round(); assert_eq!(expected, actual); } #[test] fn impl_f32x4_fast_round_int() { for (f, i) in [(1.0, 1), (1.1, 1), (-2.1, -2), (2.5, 2), (0.0, 0), (-0.0, 0)] .iter() .copied() { let a = f32x4::from(f); let expected = i32x4::from(i); let actual = a.fast_round_int(); assert_eq!(expected, actual); } } #[test] fn impl_f32x4_round_int() { for (f, i) in [ (1.0, 1), (1.1, 1), (-2.1, -2), (2.5, 2), (0.0, 0), (-0.0, 0), (f32::NAN, 0), (f32::INFINITY, i32::MAX), (f32::NEG_INFINITY, i32::MIN), ] .iter() .copied() { let a = f32x4::from(f); let expected = i32x4::from(i); let actual = a.round_int(); assert_eq!(expected, actual); } } #[test] fn impl_f32x4_fast_trunc_int() { let a = f32x4::from([1.1, 2.5, 3.7, 4.0]); let expected = i32x4::from([1, 2, 3, 4]); let actual = a.fast_trunc_int(); assert_eq!(expected, actual); // let a = f32x4::from([-1.1, -2.5, -3.7, -4.0]); let expected = i32x4::from([-1, -2, -3, -4]); let actual = a.fast_trunc_int(); assert_eq!(expected, actual); } #[test] fn impl_f32x4_trunc_int() { let a = f32x4::from([1.1, 2.5, 3.7, 4.0]); let expected = i32x4::from([1, 2, 3, 4]); let actual = a.trunc_int(); assert_eq!(expected, actual); // let a = f32x4::from([-1.1, -2.5, -3.7, -4.0]); let expected = i32x4::from([-1, -2, -3, -4]); let actual = a.trunc_int(); assert_eq!(expected, actual); // let a = f32x4::from([f32::NEG_INFINITY, f32::INFINITY, f32::NAN, 0.0]); let expected = i32x4::from([i32::MIN, i32::MAX, 0, 0]); let actual = a.trunc_int(); assert_eq!(expected, actual); } #[test] fn impl_f32x4_mul_add() { let a = f32x4::from([2.0, 3.0, 4.0, 5.0]); let b = f32x4::from([4.0, 5.0, 6.0, 7.0]); let c = f32x4::from([1.0, 1.0, 1.0, 1.0]); let expected = f32x4::from([9.0, 16.0, 25.0, 36.0]); let actual = a.mul_add(b, c); assert_eq!(expected, actual); } #[test] fn impl_f32x4_mul_neg_add() { let a = f32x4::from([2.0, 3.0, 4.0, 5.0]); let b = f32x4::from([4.0, 5.0, 6.0, 7.0]); let c = f32x4::from([1.0, 1.0, 1.0, 1.0]); let expected = f32x4::from([-7.0, -14.0, -23.0, -34.0]); let actual = a.mul_neg_add(b, c); assert_eq!(expected, actual); } #[test] fn impl_f32x4_flip_signs() { let a = f32x4::from([1.0, 1.0, -1.0, -1.0]); let b = f32x4::from([2.0, -3.0, 4.0, -5.0]); let expected = f32x4::from([1.0, -1.0, -1.0, 1.0]); let actual = a.flip_signs(b); assert_eq!(expected, actual); } #[test] fn impl_f32x4_copysign() { let a = f32x4::from([1.0, 1.0, -1.0, -1.0]); let b = f32x4::from([2.0, -3.0, 4.0, -5.0]); let expected = f32x4::from([1.0, -1.0, 1.0, -1.0]); let actual = a.copysign(b); assert_eq!(expected, actual); } #[test] fn impl_f32x4_sin_cos() { for x in -2500..=2500 { let base = (x * 4) as f32; let angles = [base, base + 1.0, base + 2.0, base + 3.0]; let (actual_sins, actual_coses) = f32x4::from(angles).sin_cos(); for i in 0..4 { let angle = angles[i]; let check = |name: &str, vals: f32x4, expected: f32| { let actual_arr: [f32; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000002, "Wanted {name}({angle}) to be {expected} but got {actual}", name = name, angle = angle, expected = expected, actual = actual ); }; check("sin", actual_sins, angle.sin()); check("cos", actual_coses, angle.cos()); } } } // NOTE:Disabled for i586 #[cfg(target_feature = "sse")] #[test] fn impl_f32x4_asin_acos() { let inc = 1.0 / 2501.0 / 4.0; for x in -2500..=2500 { let base = (x * 4) as f32 * inc; let origs = 
[base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; let (actual_asins, actual_acoses) = f32x4::from(origs).asin_acos(); for i in 0..4 { let orig = origs[i]; let check = |name: &str, vals: f32x4, expected: f32| { let actual_arr: [f32; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("asin", actual_asins, orig.asin()); check("acos", actual_acoses, orig.acos()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "sse")] #[test] fn impl_f32x4_asin() { let inc = 1.0 / 2501.0 / 4.0; for x in -2500..=2500 { let base = (x * 4) as f32 * inc; let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; let actual_asins = f32x4::from(origs).asin(); for i in 0..4 { let orig = origs[i]; let check = |name: &str, vals: f32x4, expected: f32| { let actual_arr: [f32; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("asin", actual_asins, orig.asin()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "sse")] #[test] fn impl_f32x4_acos() { let inc = 1.0 / 2501.0 / 4.0; for x in -2500..=2500 { let base = (x * 4) as f32 * inc; let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; let actual_acoses = f32x4::from(origs).acos(); for i in 0..4 { let orig = origs[i]; let check = |name: &str, vals: f32x4, expected: f32| { let actual_arr: [f32; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("acos", actual_acoses, orig.acos()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "sse")] #[test] fn impl_f32x4_atan() { let inc = 1.0 / 2501.0 / 4.0; for x in -2500..=2500 { let base = (x * 4) as f32 * inc; let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; let actual_atans = f32x4::from(origs).atan(); for i in 0..4 { let orig = origs[i]; let check = |name: &str, vals: f32x4, expected: f32| { let actual_arr: [f32; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("atan", actual_atans, orig.atan()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "sse")] #[test] fn impl_f32x4_atan2() { let inc_y = 1.0 / 51.0 / 4.0; let inc_x = 1.0 / 2501.0 / 4.0; for y in -50..=50 { let base_y = (y * 4) as f32 * inc_y; let origs_y = [base_y, base_y + inc_y, base_y + 2.0 * inc_y, base_y + 3.0 * inc_y]; let actual_y = f32x4::from(origs_y); for x in -2500..=2500 { let base_x = (x * 4) as f32 * inc_x; let origs_x = [base_x, base_x + inc_x, base_x + 2.0 * inc_x, base_x + 3.0 * inc_x]; let actual_x = f32x4::from(origs_x); let actual_atan2s = actual_y.atan2(actual_x); for i in 0..4 { let orig_y = origs_y[i]; let orig_x = origs_x[i]; let check = |name: &str, vals: f32x4, expected: f32| { let actual_arr: [f32; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - 
expected).abs() < 0.0000006, "Wanted {name}({orig_y}, {orig_x}) to be {expected} but got {actual}", name = name, orig_y = orig_y, orig_x = orig_x, expected = expected, actual = actual ); }; check("atan2", actual_atan2s, orig_y.atan2(orig_x)); } } } } #[test] fn impl_f32x4_to_degrees() { let pi = core::f32::consts::PI; let a = f32x4::from([0.0, pi / 2.0, pi, 2.0 * pi]); let expected = f32x4::from([0.0, 90.0, 180.0, 360.0]); let actual = a.to_degrees(); assert_eq!(expected, actual); } #[test] fn impl_f32x4_to_radians() { let pi = core::f32::consts::PI; let a = f32x4::from([0.0, 90.0, 180.0, 360.0]); let expected = f32x4::from([0.0, pi / 2.0, pi, 2.0 * pi]); let actual = a.to_radians(); assert_eq!(expected, actual); } #[test] fn impl_f32x4_recip() { { let expected = f32x4::from(0.0); let actual = f32x4::from(f32::INFINITY).recip(); assert_eq!(expected, actual); } { let expected = f32x4::from(0.0); let actual = f32x4::from(-f32::INFINITY).recip(); assert_eq!(expected, actual); } { let actual = f32x4::from(f32::NAN).recip(); assert!(actual.is_nan().any()); } { let expected = f32x4::from(f32::INFINITY); let actual = f32x4::from(0.0).recip(); assert_eq!(expected, actual); } { let expected = f32x4::from(0.49987793); let actual = f32x4::from(2.0).recip(); let diff: [f32; 4] = cast((actual - expected).abs()); assert!(diff[0] < 0.001); } { let expected = f32x4::from(-0.08102417); let actual = f32x4::from(-12.34).recip(); let diff: [f32; 4] = cast((actual - expected).abs()); assert!(diff[0] < 0.001); } } #[test] fn impl_f32x4_recip_sqrt() { { let expected = f32x4::from(0.0); let actual = f32x4::from(f32::INFINITY).recip_sqrt(); assert_eq!(expected, actual); } { let actual = f32x4::from(-f32::INFINITY).recip_sqrt(); assert!(actual.is_nan().any()); } { let actual = f32x4::from(f32::NAN).recip_sqrt(); assert!(actual.is_nan().any()); } { let expected = f32x4::from(f32::INFINITY); let actual = f32x4::from(0.0).recip_sqrt(); assert_eq!(expected, actual); } { let expected = f32x4::from(0.70703125); let actual = f32x4::from(2.0).recip_sqrt(); let diff: [f32; 4] = cast((actual - expected).abs()); assert!(diff[0] < 0.001); } { let actual = f32x4::from(-12.34).recip_sqrt(); assert!(actual.is_nan().any()); } } #[test] fn impl_f32x4_sqrt() { for (f, e) in [ (f32::INFINITY, f32::INFINITY), (0.0, 0.0), (-0.0, -0.0), (4.0, 2.0), (9.0, 3.0), (16.0, 4.0), (25.0, 5.0), (5000.0 * 5000.0, 5000.0), ] .iter() .copied() { let expected = f32x4::from(e); let actual = f32x4::from(f).sqrt(); assert_eq!(expected, actual); } assert_eq!( cast::<_, i32x4>(f32x4::from(f32::NAN).sqrt().is_nan()), i32x4::from(-1) ); assert_eq!( cast::<_, i32x4>(f32x4::from(f32::NEG_INFINITY).sqrt().is_nan()), i32x4::from(-1) ); assert_eq!( cast::<_, i32x4>(f32x4::from(-1.0).sqrt().is_nan()), i32x4::from(-1) ); } #[test] fn impl_f32x4_exp() { for f in [(-2.0), (-1.0), (0.0), (1.0), (1.5), (2.0), (10.0)].iter().copied() { let expected = f32x4::from((f as f32).exp()); let actual = f32x4::from(f).exp(); let diff_from_std: [f32; 4] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000000000000001); } } #[test] fn test_f32x4_move_mask() { let a = f32x4::from([-1.0, 0.0, -2.0, -3.0]); let expected = 0b1101; let actual = a.move_mask(); assert_eq!(expected, actual); // let a = f32x4::from([1.0, 0.0, 2.0, -3.0]); let expected = 0b1000; let actual = a.move_mask(); assert_eq!(expected, actual); } #[test] fn test_f32x4_any() { let a = f32x4::from([-1.0, 0.0, -2.0, f32::NAN]).is_nan(); assert!(a.any()); // let a = f32x4::from([1.0, 0.0, 2.0, 
3.0]).is_nan(); assert!(!a.any()); } #[test] fn test_f32x4_all() { let a = f32x4::from([f32::NAN; 4]).is_nan(); assert!(a.all()); // let a = f32x4::from([1.0, -0.0, 2.0, f32::NAN]).is_nan(); assert!(!a.all()); } #[test] fn test_f32x4_none() { let a = f32x4::from([1.0, 0.0, 2.0, 3.0]).is_nan(); assert!(a.none()); // let a = f32x4::from([1.0, -0.0, 2.0, f32::NAN]).is_nan(); assert!(!a.none()); } #[test] fn impl_f32x4_ln() { for f in [0.1, 0.5, 1.0, 2.718282, 10.0, 35.0, 1250.0].iter().copied() { let expected = f32x4::from((f as f32).ln()); let actual = f32x4::from(f).ln(); let diff_from_std: [f32; 4] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000001); } } #[test] fn impl_f32x4_pow() { for f in [0.1, 0.5, 1.0, 2.718282, 3.0, 4.0, 2.5, -1.0].iter().copied() { let expected = f32x4::splat(2.0 as f32).powf(f); let actual = f32x4::from(2.0_f32.powf(f)); let diff_from_std: [f32; 4] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000001); } } #[test] fn impl_f32x4_pow_n() { let p = f32x4::from([29.0, 0.1, 0.5, 1.0]); let f = f32x4::from([1.2, 2.0, 3.0, 1.5]); let res = f.pow_f32x4(p); let p: [f32; 4] = cast(p); let f: [f32; 4] = cast(f); let res: [f32; 4] = cast(res); for i in 0..p.len() { let expected = f[i].powf(p[i]); if !(expected.is_nan() && res[i].is_nan()) { assert!((expected - res[i]).abs() < 0.0001); } } let p = f32x4::from([2.718282, -0.2, -1.5, 3.4]); let f = f32x4::from([9.2, 6.1, 2.5, -4.5]); let res = f.pow_f32x4(p); let p: [f32; 4] = cast(p); let f: [f32; 4] = cast(f); let res: [f32; 4] = cast(res); for i in 0..p.len() { let expected = f[i].powf(p[i]); if !(expected.is_nan() && res[i].is_nan()) { assert!((expected - res[i]).abs() < 0.0001); } } } #[test] fn impl_f32x4_reduce_add() { let p = f32x4::splat(0.001); assert_eq!(p.reduce_add(), 0.004); } #[test] fn impl_f32x4_sum() { let mut p = Vec::with_capacity(250_000); for _ in 0..250_000 { p.push(f32x4::splat(0.001)); } let now = std::time::Instant::now(); let sum: f32 = p.iter().map(|x| x.reduce_add()).sum(); let duration = now.elapsed().as_micros(); println!("Time take {} {}us", sum, duration); let p = vec![0.001; 1_000_000]; let now = std::time::Instant::now(); let sum2: f32 = p.iter().sum(); let duration = now.elapsed().as_micros(); println!("Time take {} {}us", sum2, duration); } #[test] fn impl_f32x4_from_i32x4() { let i = i32x4::from([1, 2, 3, 4]); let f = f32x4::from([1.0, 2.0, 3.0, 4.0]); assert_eq!(f32x4::from_i32x4(i), f) } #[cfg(feature = "serde")] #[test] fn impl_f32x4_ser_de_roundtrip() { let serialized = bincode::serialize(&f32x4::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(f32x4::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_f32x8.rs000066400000000000000000000645521473735473700174720ustar00rootroot00000000000000use wide::*; use bytemuck::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } #[test] fn impl_debug_for_f32x8() { let expected = "(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0)"; let actual = format!("{:?}", f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])); assert_eq!(expected, actual); let expected = "(1.000, 2.000, 3.000, 4.000, 5.000, 6.000, 7.000, 8.000)"; let actual = format!("{:.3?}", f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])); assert_eq!(expected, actual); } #[test] fn impl_add_for_f32x8() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let b = f32x8::from([5.0, 6.0, 7.0, 8.0, 9.0, 
10.0, 11.0, 12.0]); let expected = f32x8::from([6.0, 8.0, 10.0, 12.0, 14.0, 16.0, 18.0, 20.0]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_add_const_for_f32x8() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let expected = f32x8::from([6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0]); let actual = a + 5.0; assert_eq!(expected, actual); } #[test] fn impl_sub_const_for_f32x8() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let expected = f32x8::from([-1.0, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]); let actual = a - 2.0; assert_eq!(expected, actual); } #[test] fn impl_mul_const_for_f32x8() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let expected = f32x8::from([2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]); let actual = a * 2.0; assert_eq!(expected, actual); } #[test] fn impl_div_const_for_f32x8() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let expected = f32x8::from([0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0]); let actual = a / 2.0; assert_eq!(expected, actual); } #[test] fn impl_sub_for_f32x8() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let b = f32x8::from([5.0, 7.0, 17.0, 1.0, 1.0, 9.0, 2.0, 6.0]); let expected = f32x8::from([-4.0, -5.0, -14.0, 3.0, 4.0, -3.0, 5.0, 2.0]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_f32x8() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let b = f32x8::from([5.0, 7.0, 17.0, 1.0, 5.0, 6.0, 7.0, 8.0]); let expected = f32x8::from([5.0, 14.0, 51.0, 4.0, 25.0, 36.0, 49.0, 64.0]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_div_for_f32x8() { let a = f32x8::from([4.0, 9.0, 10.0, 12.0, 5.0, 6.0, 7.0, 8.0]); let b = f32x8::from([2.0, 2.0, 5.0, -3.0, 2.0, 1.5, 3.0, 2.5]); let expected = f32x8::from([2.0, 4.5, 2.0, -4.0, 2.5, 4.0, 2.3333333, 3.2]); let actual = a / b; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_f32x8() { let a = f32x8::from([0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0]); let b = f32x8::from([0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0]); let expected = f32x8::from([0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_f32x8() { let a = f32x8::from([0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0]); let b = f32x8::from([0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0]); let expected = f32x8::from([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_f32x8() { let a = f32x8::from([0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0]); let b = f32x8::from([0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0]); let expected = f32x8::from([0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_f32x8_cmp_eq() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.0, 1.0]); let b = f32x8::from([2.0; 8]); let expected: [i32; 8] = [0, -1, 0, 0, 0, 0, -1, 0]; let actual: [i32; 8] = cast(a.cmp_eq(b)); assert_eq!(expected, actual); } #[test] fn impl_f32x8_cmp_ne() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.0, 1.0]); let b = f32x8::from([2.0; 8]); let expected: [i32; 8] = [-1, 0, -1, -1, -1, -1, 0, -1]; let actual: [i32; 8] = cast(a.cmp_ne(b)); assert_eq!(expected, actual); } #[test] fn impl_f32x8_cmp_ge() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.0, 1.0]); let b = f32x8::from([2.0; 8]); let expected: [i32; 8] = [0, -1, -1, -1, -1, -1, -1, 0]; let actual: [i32; 8] = cast(a.cmp_ge(b)); assert_eq!(expected, 
actual); } #[test] fn impl_f32x8_cmp_gt() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 3.0, 1.0]); let b = f32x8::from([3.0; 8]); let expected: [i32; 8] = [0, 0, 0, -1, -1, -1, 0, 0]; let actual: [i32; 8] = cast(a.cmp_gt(b)); assert_eq!(expected, actual); } #[test] fn impl_f32x8_cmp_le() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.0, 1.0]); let b = f32x8::from([4.0; 8]); let expected: [i32; 8] = [-1, -1, -1, -1, 0, 0, -1, -1]; let actual: [i32; 8] = cast(a.cmp_le(b)); assert_eq!(expected, actual); } #[test] fn impl_f32x8_cmp_lt() { let a = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 2.0, 1.0]); let b = f32x8::from([3.0; 8]); let expected: [i32; 8] = [-1, -1, 0, 0, 0, 0, -1, -1]; let actual: [i32; 8] = cast(a.cmp_lt(b)); assert_eq!(expected, actual); let expected: [i32; 8] = [0, 0, 0, 0, 0, 0, 0, 0]; let actual: [i32; 8] = cast(a.cmp_lt(a)); assert_eq!(expected, actual); } #[test] fn impl_f32x8_blend() { let use_t: f32 = f32::from_bits(u32::MAX); let t = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let f = f32x8::from([5.0, 6.0, 7.0, 8.0, 21.0, 22.0, 23.0, 24.0]); let mask = f32x8::from([use_t, 0.0, use_t, 0.0, 0.0, 0.0, 0.0, use_t]); let expected = f32x8::from([1.0, 6.0, 3.0, 8.0, 21.0, 22.0, 23.0, 8.0]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_f32x8_abs() { let a = f32x8::from([-1.0, 2.0, -3.5, f32::NEG_INFINITY, 6.0, 15.0, -19.0, -9.0]); let expected = f32x8::from([1.0, 2.0, 3.5, f32::INFINITY, 6.0, 15.0, 19.0, 9.0]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_f32x8_floor() { let a = f32x8::from([-1.1, 60.9, 1.1, f32::INFINITY, 96.6, -53.2, 0.1, 9.2]); let expected = f32x8::from([-2.0, 60.0, 1.0, f32::INFINITY, 96.0, -54.0, 0.0, 9.0]); let actual = a.floor(); assert_eq!(expected, actual); } #[test] fn impl_f64x4_ceil() { let a = f32x8::from([-1.1, 60.9, 1.1, f32::NEG_INFINITY, 96.6, -53.2, 0.1, 9.2]); let expected = f32x8::from([-1.0, 61.0, 2.0, f32::NEG_INFINITY, 97.0, -53.0, 1.0, 10.0]); let actual = a.ceil(); assert_eq!(expected, actual); } #[test] fn impl_f32x8_fast_max() { let a = f32x8::from([1.0, 5.0, 3.0, 0.0, 6.0, -8.0, 12.0, 9.0]); let b = f32x8::from([2.0, -3.0, f32::INFINITY, 10.0, 19.0, -5.0, -1.0, -9.0]); let expected = f32x8::from([2.0, 5.0, f32::INFINITY, 10.0, 19.0, -5.0, 12.0, 9.0]); let actual = a.fast_max(b); assert_eq!(expected, actual); } #[test] fn impl_f32x8_max() { let a = f32x8::from([1.0, 5.0, 3.0, f32::NAN, 6.0, -8.0, 12.0, f32::NAN]); let b = f32x8::from([2.0, -3.0, f32::INFINITY, 10.0, 19.0, f32::NAN, -1.0, -9.0]); let expected = f32x8::from([2.0, 5.0, f32::INFINITY, 10.0, 19.0, -8.0, 12.0, -9.0]); let actual = a.max(b); assert_eq!(expected, actual); } #[test] fn impl_f32x8_fast_min() { let a = f32x8::from([1.0, 5.0, 3.0, f32::NEG_INFINITY, 6.0, -8.0, 12.0, 9.0]); let b = f32x8::from([2.0, -3.0, f32::INFINITY, 10.0, 19.0, -5.0, -1.0, -9.0]); let expected = f32x8::from([1.0, -3.0, 3.0, f32::NEG_INFINITY, 6.0, -8.0, -1.0, -9.0]); let actual = a.fast_min(b); assert_eq!(expected, actual); } #[test] fn impl_f32x8_min() { let a = f32x8::from([1.0, 5.0, 3.0, f32::NEG_INFINITY, 6.0, -8.0, 12.0, f32::NAN]); let b = f32x8::from([2.0, -3.0, f32::INFINITY, 10.0, 19.0, f32::NAN, -1.0, -9.0]); let expected = f32x8::from([1.0, -3.0, 3.0, f32::NEG_INFINITY, 6.0, -8.0, -1.0, -9.0]); let actual = a.min(b); assert_eq!(expected, actual); } #[test] fn impl_f32x8_is_nan() { let a = f32x8::from([0.0, f32::NAN, f32::NAN, 0.0, 0.0, 0.0, f32::NAN, 0.0]); let expected: [u32; 8] = [0, 
u32::MAX, u32::MAX, 0, 0, 0, u32::MAX, 0]; let actual: [u32; 8] = cast(a.is_nan()); assert_eq!(expected, actual); } #[test] fn impl_f32x8_is_finite() { let a = f32x8::from([ f32::NAN, 1.0, f32::INFINITY, f32::NEG_INFINITY, 2.0, 5.0, f32::INFINITY, 9.0, ]); let expected: [u32; 8] = [0, u32::MAX, 0, 0, u32::MAX, u32::MAX, 0, u32::MAX]; let actual: [u32; 8] = cast(a.is_finite()); assert_eq!(expected, actual); } #[test] fn impl_f32x8_round() { let a = f32x8::from([1.1, 2.5, 3.7, 4.0, 7.2, 10.5, 12.7, 35.12]); let expected = f32x8::from([1.0, 2.0, 4.0, 4.0, 7.0, 10.0, 13.0, 35.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f32x8::from([-1.1, -2.5, -3.7, -4.0, -7.2, -10.5, -12.7, -35.12]); let expected = f32x8::from([-1.0, -2.0, -4.0, -4.0, -7.0, -10.0, -13.0, -35.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f32x8::from([ f32::INFINITY, f32::NEG_INFINITY, 5.5, 5.0, 7.2, 10.5, 12.7, 35.12, ]); let expected = f32x8::from([ f32::INFINITY, f32::NEG_INFINITY, 6.0, 5.0, 7.0, 10.0, 13.0, 35.0, ]); let actual = a.round(); assert_eq!(expected, actual); // let a = f32x8::from(f32::NAN); let expected: [u32; 8] = [u32::MAX; 8]; let actual: [u32; 8] = cast(a.round().is_nan()); assert_eq!(expected, actual); // let a = f32x8::from(-0.0); let expected = a; let actual = a.round(); assert_eq!(expected, actual); } #[test] fn impl_f32x8_fast_round_int() { for (f, i) in [(1.0, 1), (1.1, 1), (-2.1, -2), (2.5, 2), (0.0, 0), (-0.0, 0)] .iter() .copied() { let a = f32x8::from(f); let expected = i32x8::from(i); let actual = a.fast_round_int(); assert_eq!(expected, actual); } } #[test] fn impl_f32x8_round_int() { for (f, i) in [ (1.0, 1), (1.1, 1), (-2.1, -2), (2.5, 2), (0.0, 0), (-0.0, 0), (f32::NAN, 0), (f32::INFINITY, i32::MAX), (f32::NEG_INFINITY, i32::MIN), ] .iter() .copied() { let a = f32x8::from(f); let expected = i32x8::from(i); let actual = a.round_int(); assert_eq!(expected, actual); } } #[test] fn impl_f32x8_fast_trunc_int() { for (f, i) in [(1.0, 1), (1.1, 1), (-2.1, -2), (2.5, 2), (3.7, 3), (-0.0, 0)] .iter() .copied() { let a = f32x8::from(f); let expected = i32x8::from(i); let actual = a.fast_trunc_int(); assert_eq!(expected, actual); } } #[test] fn impl_f32x8_trunc_int() { for (f, i) in [ (1.0, 1), (1.1, 1), (-2.1, -2), (2.5, 2), (3.7, 3), (-0.0, 0), (f32::NAN, 0), (f32::INFINITY, i32::MAX), (f32::NEG_INFINITY, i32::MIN), ] .iter() .copied() { let a = f32x8::from(f); let expected = i32x8::from(i); let actual = a.trunc_int(); assert_eq!(expected, actual); } } #[test] fn impl_f32x8_mul_add() { let a = f32x8::from([2.0, 3.0, 4.0, 5.0, 6.7, 9.2, 11.5, 12.2]); let b = f32x8::from([4.0, 5.0, 6.0, 7.0, 1.5, 8.9, 4.2, 5.6]); let c = f32x8::from([1.0; 8]); let expected: [f32; 8] = cast(f32x8::from([9.0, 16.0, 25.0, 36.0, 11.05, 82.88, 49.3, 69.32])); let actual: [f32; 8] = cast(a.mul_add(b, c)); for (act, exp) in actual.iter().zip(expected.iter()) { assert!((exp - act).abs() < 0.000001); } } #[test] fn impl_f32x8_mul_neg_add() { let a = f32x8::from([2.0, 3.0, 4.0, 5.0, 6.7, 9.2, 11.5, 12.2]); let b = f32x8::from([4.0, 5.0, 6.0, 7.0, 1.5, 8.9, 4.2, -5.6]); let c = f32x8::from([1.0; 8]); let expected: [f32; 8] = cast(f32x8::from([-7.0, -14.0, -23.0, -34.0, -9.05, -80.88, -47.3, 69.32])); let actual: [f32; 8] = cast(a.mul_neg_add(b, c)); for (act, exp) in actual.iter().zip(expected.iter()) { assert!((exp - act).abs() < 0.00001); } } #[test] fn impl_f32x8_flip_signs() { let a = f32x8::from([1.0, 1.0, -1.0, -1.0, 5.2, 6.7, -8.2, -12.5]); let b = f32x8::from([2.0, -3.0, 
4.0, -5.0, 5.2, 6.7, -8.2, -12.5]); let expected = f32x8::from([1.0, -1.0, -1.0, 1.0, 5.2, 6.7, 8.2, 12.5]); let actual = a.flip_signs(b); assert_eq!(expected, actual); } #[test] fn impl_f32x8_copysign() { let a = f32x8::from([1.0, 1.0, -1.0, -1.0, 5.2, 6.7, -8.2, -12.5]); let b = f32x8::from([2.0, -3.0, 4.0, -5.0, 5.2, 6.7, -8.2, -12.5]); let expected = f32x8::from([1.0, -1.0, 1.0, -1.0, 5.2, 6.7, -8.2, -12.5]); let actual = a.copysign(b); assert_eq!(expected, actual); } // NOTE: Disabled #[cfg(target_feature = "sse")] #[test] fn impl_f32x8_asin_acos() { let inc = 1.0 / 2501.0 / 8.0; for x in -2500..=2500 { let base = (x * 8) as f32 * inc; let origs = [ base, base + inc, base + 2.0 * inc, base + 3.0 * inc, base + 4.0 * inc, base + 5.0 * inc, base + 6.0 * inc, base + 7.0 * inc, ]; let (actual_asins, actual_acoses) = f32x8::from(origs).asin_acos(); for i in 0..8 { let orig = origs[i]; let check = |name: &str, vals: f32x8, expected: f32| { let actual_arr: [f32; 8] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("asin", actual_asins, orig.asin()); check("acos", actual_acoses, orig.acos()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "avx")] #[test] fn impl_f32x8_asin() { let inc = 1.0 / 2501.0 / 8.0; for x in -2500..=2500 { let base = (x * 4) as f32 * inc; let origs = [ base, base + inc, base + 2.0 * inc, base + 3.0 * inc, base + 4.0 * inc, base + 5.0 * inc, base + 6.0 * inc, base + 7.0 * inc, ]; let actual_asins = f32x8::from(origs).asin(); for i in 0..8 { let orig = origs[i]; let check = |name: &str, vals: f32x8, expected: f32| { let actual_arr: [f32; 8] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("asin", actual_asins, orig.asin()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "avx")] #[test] fn impl_f32x8_acos() { let inc = 1.0 / 2501.0 / 8.0; for x in -2500..=2500 { let base = (x * 8) as f32 * inc; let origs = [ base, base + inc, base + 2.0 * inc, base + 3.0 * inc, base + 4.0 * inc, base + 5.0 * inc, base + 6.0 * inc, base + 7.0 * inc, ]; let actual_acoses = f32x8::from(origs).acos(); for i in 0..8 { let orig = origs[i]; let check = |name: &str, vals: f32x8, expected: f32| { let actual_arr: [f32; 8] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("acos", actual_acoses, orig.acos()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "avx")] #[test] fn impl_f32x8_atan() { let inc = 1.0 / 2501.0 / 8.0; for x in -2500..=2500 { let base = (x * 8) as f32 * inc; let origs = [ base, base + inc, base + 2.0 * inc, base + 3.0 * inc, base + 4.0 * inc, base + 5.0 * inc, base + 6.0 * inc, base + 7.0 * inc, ]; let actual_atans = f32x8::from(origs).atan(); for i in 0..8 { let orig = origs[i]; let check = |name: &str, vals: f32x8, expected: f32| { let actual_arr: [f32; 8] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} 
but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("atan", actual_atans, orig.atan()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "avx")] #[test] fn impl_f32x8_atan2() { let inc_y = 1.0 / 51.0 / 8.0; let inc_x = 1.0 / 2501.0 / 8.0; for y in -50..=50 { let base_y = (y * 8) as f32 * inc_y; let origs_y = [ base_y, base_y + inc_y, base_y + 2.0 * inc_y, base_y + 3.0 * inc_y, base_y + 4.0 * inc_y, base_y + 5.0 * inc_y, base_y + 6.0 * inc_y, base_y + 7.0 * inc_y, ]; let actual_y = f32x8::from(origs_y); for x in -2500..=2500 { let base_x = (x * 8) as f32 * inc_x; let origs_x = [ base_x, base_x + inc_x, base_x + 2.0 * inc_x, base_x + 3.0 * inc_x, base_x + 4.0 * inc_x, base_x + 5.0 * inc_x, base_x + 6.0 * inc_x, base_x + 7.0 * inc_x, ]; let actual_x = f32x8::from(origs_x); let actual_atan2s = actual_y.atan2(actual_x); for i in 0..8 { let orig_y = origs_y[i]; let orig_x = origs_x[i]; let check = |name: &str, vals: f32x8, expected: f32| { let actual_arr: [f32; 8] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig_y}, {orig_x}) to be {expected} but got {actual}", name = name, orig_y = orig_y, orig_x = orig_x, expected = expected, actual = actual ); }; check("atan2", actual_atan2s, orig_y.atan2(orig_x)); } } } } #[test] fn impl_f32x8_sin_cos() { for x in -2500..=2500 { let base = (x * 4) as f32; let angles = [ base, base + 1.0, base + 2.0, base + 3.0, base + 4.0, base + 5.0, base + 6.0, base + 7.0, ]; let (actual_sins, actual_coses) = f32x8::from(angles).sin_cos(); for i in 0..4 { let angle = angles[i]; let check = |name: &str, vals: f32x8, expected: f32| { let actual_arr: [f32; 8] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000002, "Wanted {name}({angle}) to be {expected} but got {actual}", name = name, angle = angle, expected = expected, actual = actual ); }; check("sin", actual_sins, angle.sin()); check("cos", actual_coses, angle.cos()); } } } #[test] fn impl_f32x8_to_degrees() { let pi = core::f32::consts::PI; let a = f32x8::from([0.0, pi / 2.0, pi, 2.0 * pi, 0.0, pi / 2.0, pi, 2.0 * pi]); let expected = f32x8::from([0.0, 90.0, 180.0, 360.0, 0.0, 90.0, 180.0, 360.0]); let actual = a.to_degrees(); assert_eq!(expected, actual); } #[test] fn impl_f32x8_to_radians() { let pi = core::f32::consts::PI; let a = f32x8::from([0.0, 90.0, 180.0, 360.0, 0.0, 90.0, 180.0, 360.0]); let expected = f32x8::from([0.0, pi / 2.0, pi, 2.0 * pi, 0.0, pi / 2.0, pi, 2.0 * pi]); let actual = a.to_radians(); assert_eq!(expected, actual); } #[test] fn impl_f32x8_recip() { { let expected = f32x8::from(0.0); let actual = f32x8::from(f32::INFINITY).recip(); assert_eq!(expected, actual); } { let expected = f32x8::from(0.0); let actual = f32x8::from(-f32::INFINITY).recip(); assert_eq!(expected, actual); } { let actual = f32x8::from(f32::NAN).recip(); assert!(actual.is_nan().any()); } { let expected = f32x8::from(f32::INFINITY); let actual = f32x8::from(0.0).recip(); assert_eq!(expected, actual); } { let expected = f32x8::from(0.49987793); let actual = f32x8::from(2.0).recip(); let diff: [f32; 8] = cast((actual - expected).abs()); assert!(diff[0] < 0.001); } { let expected = f32x8::from(-0.08102417); let actual = f32x8::from(-12.34).recip(); let diff: [f32; 8] = cast((actual - expected).abs()); assert!(diff[0] < 0.001); } } #[test] fn impl_f32x8_recip_sqrt() { { let expected = f32x8::from(0.0); let actual = 
f32x8::from(f32::INFINITY).recip_sqrt(); assert_eq!(expected, actual); } { let actual = f32x8::from(-f32::INFINITY).recip_sqrt(); assert!(actual.is_nan().any()); } { let actual = f32x8::from(f32::NAN).recip_sqrt(); assert!(actual.is_nan().any()); } { let expected = f32x8::from(f32::INFINITY); let actual = f32x8::from(0.0).recip_sqrt(); assert_eq!(expected, actual); } { let expected = f32x8::from(0.70703125); let actual = f32x8::from(2.0).recip_sqrt(); let diff: [f32; 8] = cast((actual - expected).abs()); assert!(diff[0] < 0.001); } { let actual = f32x8::from(-12.34).recip_sqrt(); assert!(actual.is_nan().any()); } } #[test] fn impl_f32x8_sqrt() { for (f, e) in [ (f32::INFINITY, f32::INFINITY), (0.0, 0.0), (-0.0, -0.0), (4.0, 2.0), (9.0, 3.0), (16.0, 4.0), (25.0, 5.0), (5000.0 * 5000.0, 5000.0), ] .iter() .copied() { let expected = f32x8::from(e); let actual = f32x8::from(f).sqrt(); assert_eq!(expected, actual); } assert_eq!( cast::<_, i32x8>(f32x8::from(f32::NAN).sqrt().is_nan()), i32x8::from(-1) ); assert_eq!( cast::<_, i32x8>(f32x8::from(f32::NEG_INFINITY).sqrt().is_nan()), i32x8::from(-1) ); assert_eq!( cast::<_, i32x8>(f32x8::from(-1.0).sqrt().is_nan()), i32x8::from(-1) ); } #[test] fn impl_f32x8_exp() { for f in [(-2.0), (-1.0), (0.0), (1.0), (1.5), (2.0), (10.0)].iter().copied() { let expected = f32x8::from((f as f32).exp()); let actual = f32x8::from(f).exp(); let diff_from_std: [f32; 8] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000000000000001); } } #[test] fn test_f32x8_move_mask() { let a = f32x8::from([-1.0, 0.0, -2.0, -3.0, -1.0, 0.0, -2.0, -3.0]); let expected = 0b11011101; let actual = a.move_mask(); assert_eq!(expected, actual); // let a = f32x8::from([1.0, 0.0, 2.0, -3.0, 1.0, 0.0, 2.0, -3.0]); let expected = 0b10001000; let actual = a.move_mask(); assert_eq!(expected, actual); } #[test] fn test_f32x8_any() { let a = f32x8::from([-1.0, 0.0, -2.0, -3.0, 2.0, -1.0, -2.0, f32::NAN]).is_nan(); assert!(a.any()); // let a = f32x8::from([1.0, 0.0, 2.0, 3.0, 2.0, 5.0, 6.7, 7.1]).is_nan(); assert!(!a.any()); } #[test] fn test_f32x8_all() { let a = f32x8::from([f32::NAN; 8]).is_nan(); assert!(a.all()); // let a = f32x8::from([1.0, -0.0, 2.0, 3.0, 4.0, 9.0, 7.2, f32::NAN]).is_nan(); assert!(!a.all()); } #[test] fn test_f32x8_none() { let a = f32x8::from([1.0, 0.0, 2.0, 3.0, 1.0, 0.0, 2.0, 3.0]).is_nan(); assert!(a.none()); // let a = f32x8::from([1.0, -0.0, 2.0, 3.0, 1.0, -0.0, 2.0, f32::NAN]).is_nan(); assert!(!a.none()); } #[test] fn impl_f32x8_ln() { for f in [0.1, 0.5, 1.0, 2.718282, 10.0, 35.0, 1250.0].iter().copied() { let expected = f32x8::from((f as f32).ln()); let actual = f32x8::from(f).ln(); let diff_from_std: [f32; 8] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.0000001); } } #[test] fn impl_f32x8_pow() { for f in [0.1, 0.5, 1.0, 2.718282, 3.0, 4.0, 2.5, -1.0].iter().copied() { let expected = f32x8::splat(2.0 as f32).powf(f); let actual = f32x8::from(2.0_f32.powf(f)); let diff_from_std: [f32; 8] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000001); } } #[test] fn impl_f32x8_pow_n() { let p = f32x8::from([29.0, 0.1, 0.5, 1.0, 2.718282, -0.2, -1.5, 3.4]); let f = f32x8::from([1.2, 2.0, 3.0, 1.5, 9.2, 6.1, 2.5, -4.5]); let res = f.pow_f32x8(p); let p: [f32; 8] = cast(p); let f: [f32; 8] = cast(f); let res: [f32; 8] = cast(res); for i in 0..p.len() { let expected = f[i].powf(p[i]); if !(expected.is_nan() && res[i].is_nan()) { assert!((expected - res[i]).abs() < 0.0001); } } } #[test] fn impl_f32x8_reduce_add() { let p = 
f32x8::from([0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.009]); assert!((p.reduce_add() - 0.037) < 0.000000001); } #[test] fn impl_f32x8_sum() { let mut p = Vec::with_capacity(250_000); for _ in 0..125_000 { p.push(f32x8::splat(0.001)); } let now = std::time::Instant::now(); let sum: f32 = p.iter().map(|x| x.reduce_add()).sum(); let duration = now.elapsed().as_micros(); println!("Time taken {} {}us", sum, duration); let p = vec![0.001; 1_000_000]; let now = std::time::Instant::now(); let sum2: f32 = p.iter().sum(); let duration = now.elapsed().as_micros(); println!("Time taken {} {}us", sum2, duration); } #[test] fn impl_transpose_for_f32x8() { let a = [ f32x8::new([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1]), f32x8::new([8.1, 9.1, 10.1, 11.1, 12.1, 13.1, 14.1, 15.1]), f32x8::new([16.1, 17.1, 18.1, 19.1, 20.1, 21.1, 22.1, 23.1]), f32x8::new([24.1, 25.1, 26.1, 27.1, 28.1, 29.1, 30.1, 31.1]), f32x8::new([32.1, 33.1, 34.1, 35.1, 36.1, 37.1, 38.1, 39.1]), f32x8::new([40.1, 41.1, 42.1, 43.1, 44.1, 45.1, 46.1, 47.1]), f32x8::new([48.1, 49.1, 50.1, 51.1, 52.1, 53.1, 54.1, 55.1]), f32x8::new([ 5600000.1, 5700000.1, 5800000.1, 5900000.1, 6000000.1, 6100000.1, 6200000.1, 6300000.1, ]), ]; let result = f32x8::transpose(a); let expected = [ f32x8::new([0.1, 8.1, 16.1, 24.1, 32.1, 40.1, 48.1, 5600000.1]), f32x8::new([1.1, 9.1, 17.1, 25.1, 33.1, 41.1, 49.1, 5700000.1]), f32x8::new([2.1, 10.1, 18.1, 26.1, 34.1, 42.1, 50.1, 5800000.1]), f32x8::new([3.1, 11.1, 19.1, 27.1, 35.1, 43.1, 51.1, 5900000.1]), f32x8::new([4.1, 12.1, 20.1, 28.1, 36.1, 44.1, 52.1, 6000000.1]), f32x8::new([5.1, 13.1, 21.1, 29.1, 37.1, 45.1, 53.1, 6100000.1]), f32x8::new([6.1, 14.1, 22.1, 30.1, 38.1, 46.1, 54.1, 6200000.1]), f32x8::new([7.1, 15.1, 23.1, 31.1, 39.1, 47.1, 55.1, 6300000.1]), ]; assert_eq!(result, expected); } #[test] fn impl_f32x8_from_i32x8() { let i = i32x8::from([1, 2, 3, 4, 5, 6, 7, 8]); let f = f32x8::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); assert_eq!(f32x8::from_i32x8(i), f) } #[cfg(feature = "serde")] #[test] fn impl_f32x8_ser_de_roundtrip() { let serialized = bincode::serialize(&f32x8::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserialization failed"); assert_eq!(f32x8::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_f64x2.rs use core::f64; use wide::*; use bytemuck::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::<f64x2>(), 16); assert_eq!(core::mem::align_of::<f64x2>(), 16); } #[test] fn impl_add_for_f64x2() { let a = f64x2::from([1.0, 2.0]); let b = f64x2::from([5.0, 6.0]); let expected = f64x2::from([6.0, 8.0]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_f64x2() { let a = f64x2::from([1.0, 2.0]); let b = f64x2::from([5.0, -10.0]); let expected = f64x2::from([-4.0, 12.0]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_f64x2() { let a = f64x2::from([1.0, 2.0]); let b = f64x2::from([5.0, -10.0]); let expected = f64x2::from([5.0, -20.0]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_div_for_f64x2() { let a = f64x2::from([50.0, 2.0]); let b = f64x2::from([5.0, -10.0]); let expected = f64x2::from([10.0, -0.2]); let actual = a / b; assert_eq!(expected, actual); } #[test] fn impl_sub_const_for_f64x2() { let a = f64x2::from([1.0, 2.0]); let expected = f64x2::from([-1.0, 0.0]); let actual = a - 2.0; assert_eq!(expected, actual); } #[test] fn impl_mul_const_for_f64x2() { let a = 
f64x2::from([1.0, 2.0]); let expected = f64x2::from([2.0, 4.0]); let actual = a * 2.0; assert_eq!(expected, actual); } #[test] fn impl_div_const_for_f64x2() { let a = f64x2::from([1.0, 2.0]); let expected = f64x2::from([0.5, 1.0]); let actual = a / 2.0; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_f64x2() { let a = f64x2::from([0.0, 1.0]); let b = f64x2::from([1.0, 1.0]); let expected = f64x2::from([0.0, 1.0]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_f64x2() { let a = f64x2::from([0.0, 1.0]); let b = f64x2::from([1.0, 1.0]); let expected = f64x2::from([1.0, 1.0]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_f64x2() { let a = f64x2::from([0.0, 1.0]); let b = f64x2::from([1.0, 1.0]); let expected = f64x2::from([1.0, 0.0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_f64x2_cmp_eq() { let a = f64x2::from([1.0, 2.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [0, -1]; let actual: [i64; 2] = cast(a.cmp_eq(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x2_cmp_ne() { let a = f64x2::from([1.0, 2.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [-1, 0]; let actual: [i64; 2] = cast(a.cmp_ne(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x2_cmp_ge() { let a = f64x2::from([1.0, 2.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [0, -1]; let actual: [i64; 2] = cast(a.cmp_ge(b)); assert_eq!(expected, actual); // let a = f64x2::from([3.0, 4.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [-1, -1]; let actual: [i64; 2] = cast(a.cmp_ge(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x2_cmp_gt() { let a = f64x2::from([1.0, 2.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [0, 0]; let actual: [i64; 2] = cast(a.cmp_gt(b)); assert_eq!(expected, actual); // let a = f64x2::from([3.0, 4.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [-1, -1]; let actual: [i64; 2] = cast(a.cmp_gt(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x2_cmp_le() { let a = f64x2::from([1.0, 2.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [-1, -1]; let actual: [i64; 2] = cast(a.cmp_le(b)); assert_eq!(expected, actual); // let a = f64x2::from([3.0, 4.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [0, 0]; let actual: [i64; 2] = cast(a.cmp_le(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x2_cmp_lt() { let a = f64x2::from([1.0, 2.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [-1, 0]; let actual: [i64; 2] = cast(a.cmp_lt(b)); assert_eq!(expected, actual); // let a = f64x2::from([3.0, 4.0]); let b = f64x2::from([2.0, 2.0]); let expected: [i64; 2] = [0, 0]; let actual: [i64; 2] = cast(a.cmp_lt(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x2_const_cmp_lt() { let a = f64x2::from([1.0, 2.0]); let expected: [i64; 2] = [-1, 0]; let actual: [i64; 2] = cast(a.cmp_lt(2.0)); assert_eq!(expected, actual); // let a = f64x2::from([3.0, 4.0]); let expected: [i64; 2] = [0, 0]; let actual: [i64; 2] = cast(a.cmp_lt(2.0)); assert_eq!(expected, actual); let a = f64x2::from([3.0, 4.0]); let expected: [i64; 2] = [0, 0]; let actual: [i64; 2] = cast(a.cmp_lt(a)); assert_eq!(expected, actual); } #[test] fn impl_f64x2_blend() { let use_t: f64 = f64::from_bits(u64::MAX); let t = f64x2::from([1.0, 2.0]); let f = f64x2::from([5.0, 6.0]); let mask = f64x2::from([use_t, 0.0]); let expected = f64x2::from([1.0, 6.0]); let actual = mask.blend(t, f); assert_eq!(expected, actual); 
} #[test] fn impl_f64x2_abs() { let a = f64x2::from([-1.0, 2.0]); let expected = f64x2::from([1.0, 2.0]); let actual = a.abs(); assert_eq!(expected, actual); // let a = f64x2::from([-3.5, f64::NEG_INFINITY]); let expected = f64x2::from([3.5, f64::INFINITY]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_f64x2_floor() { let a = f64x2::from([-1.1, 2.0]); let expected = f64x2::from([-2.0, 2.0]); let actual = a.floor(); assert_eq!(expected, actual); // let a = f64x2::from([60.9, f64::INFINITY]); let expected = f64x2::from([60.0, f64::INFINITY]); let actual = a.floor(); assert_eq!(expected, actual); } #[test] fn impl_f64x2_ceil() { let a = f64x2::from([-1.1, 2.0]); let expected = f64x2::from([-1.0, 2.0]); let actual = a.ceil(); assert_eq!(expected, actual); // let a = f64x2::from([60.9, f64::NEG_INFINITY]); let expected = f64x2::from([61.0, f64::NEG_INFINITY]); let actual = a.ceil(); assert_eq!(expected, actual); } #[test] fn impl_f64x2_fast_max() { let a = f64x2::from([-0.0, -5.0]); let b = f64x2::from([0.0, 3.0]); let expected = f64x2::from([0.0, 3.0]); let actual = a.fast_max(b); assert_eq!(expected, actual); let a = f64x2::from([f64::NEG_INFINITY, 5.0]); let b = f64x2::from([2.0, f64::INFINITY]); let expected = f64x2::from([2.0, f64::INFINITY]); let actual = a.fast_max(b); assert_eq!(expected, actual); } #[test] fn impl_f64x2_max() { let a = f64x2::from([-0.0, -5.0]); let b = f64x2::from([0.0, 3.0]); let expected = f64x2::from([0.0, 3.0]); let actual = a.max(b); assert_eq!(expected, actual); let a = f64x2::from([f64::NEG_INFINITY, 5.0]); let b = f64x2::from([2.0, f64::INFINITY]); let expected = f64x2::from([2.0, f64::INFINITY]); let actual = a.max(b); assert_eq!(expected, actual); let a = f64x2::from([f64::NAN, 5.0]); let b = f64x2::from([2.0, f64::NAN]); let expected = f64x2::from([2.0, 5.0]); let actual = a.max(b); assert_eq!(expected, actual); } #[test] fn impl_f64x2_fast_min() { let a = f64x2::from([-0.0, -5.0]); let b = f64x2::from([0.0, 3.0]); let expected = f64x2::from([-0.0, -5.0]); let actual = a.fast_min(b); assert_eq!(expected, actual); let a = f64x2::from([f64::NEG_INFINITY, 5.0]); let b = f64x2::from([2.0, f64::INFINITY]); let expected = f64x2::from([f64::NEG_INFINITY, 5.0]); let actual = a.fast_min(b); assert_eq!(expected, actual); } #[test] fn impl_f64x2_min() { let a = f64x2::from([-0.0, -5.0]); let b = f64x2::from([0.0, 3.0]); let expected = f64x2::from([-0.0, -5.0]); let actual = a.min(b); assert_eq!(expected, actual); let a = f64x2::from([f64::NEG_INFINITY, 5.0]); let b = f64x2::from([2.0, f64::INFINITY]); let expected = f64x2::from([f64::NEG_INFINITY, 5.0]); let actual = a.min(b); assert_eq!(expected, actual); let a = f64x2::from([f64::NAN, 5.0]); let b = f64x2::from([2.0, f64::NAN]); let expected = f64x2::from([2.0, 5.0]); let actual = a.min(b); assert_eq!(expected, actual); } #[test] fn impl_f64x2_is_nan() { let a = f64x2::from([0.0, f64::NAN]); let expected = [0, u64::MAX]; let actual: [u64; 2] = cast(a.is_nan()); assert_eq!(expected, actual); } #[test] fn impl_f64x2_is_finite() { let a = f64x2::from([f64::NAN, 1.0]); let expected = [0, u64::MAX]; let actual: [u64; 2] = cast(a.is_finite()); assert_eq!(expected, actual); // let a = f64x2::from([f64::INFINITY, f64::NEG_INFINITY]); let expected = [0, 0]; let actual: [u64; 2] = cast(a.is_finite()); assert_eq!(expected, actual); } #[test] fn impl_f64x2_round() { let a = f64x2::from([1.1, 2.5]); let expected = f64x2::from([1.0, 2.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = 
f64x2::from([3.7, 4.0]); let expected = f64x2::from([4.0, 4.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f64x2::from([-1.1, -2.5]); let expected = f64x2::from([-1.0, -2.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f64x2::from([-3.7, -4.0]); let expected = f64x2::from([-4.0, -4.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f64x2::from([f64::INFINITY, f64::NEG_INFINITY]); let expected = f64x2::from([f64::INFINITY, f64::NEG_INFINITY]); let actual = a.round(); assert_eq!(expected, actual); // let a = f64x2::from([5.5, 5.0]); let expected = f64x2::from([6.0, 5.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f64x2::from(f64::NAN); let expected: [u64; 2] = [u64::MAX; 2]; let actual: [u64; 2] = cast(a.round().is_nan()); assert_eq!(expected, actual); // let a = f64x2::from(-0.0); let expected = a; let actual = a.round(); assert_eq!(expected, actual); } #[test] fn impl_f64x2_round_int() { for (f, i) in [ (1.0, 1), (1.1, 1), (-2.1, -2), (2.5, 2), (0.0, 0), (-0.0, 0), (f64::NAN, 0), (f64::INFINITY, i64::MAX), (f64::NEG_INFINITY, i64::MIN), ] .iter() .copied() { let a = f64x2::from(f); let expected = i64x2::from(i); let actual = a.round_int(); assert_eq!(expected, actual); } } #[test] fn impl_f64x2_mul_add() { let a = f64x2::from([2.0, 3.0]); let b = f64x2::from([4.0, 5.0]); let c = f64x2::from([1.0, 1.0]); let expected = f64x2::from([9.0, 16.0]); let actual = a.mul_add(b, c); assert_eq!(expected, actual); } #[test] fn impl_f64x2_mul_neg_add() { let a = f64x2::from([2.0, 3.0]); let b = f64x2::from([4.0, 5.0]); let c = f64x2::from([1.0, 1.0]); let expected = f64x2::from([-7.0, -14.0]); let actual = a.mul_neg_add(b, c); assert_eq!(expected, actual); } #[test] fn impl_f64x2_flip_signs() { let a = f64x2::from([1.0, 1.0]); let b = f64x2::from([2.0, -3.0]); let expected = f64x2::from([1.0, -1.0]); let actual = a.flip_signs(b); assert_eq!(expected, actual); // let a = f64x2::from([-1.0, -1.0]); let b = f64x2::from([4.0, -5.0]); let expected = f64x2::from([-1.0, 1.0]); let actual = a.flip_signs(b); assert_eq!(expected, actual); } #[test] fn impl_f64x2_copysign() { let a = f64x2::from([1.0, 1.0]); let b = f64x2::from([2.0, -3.0]); let expected = f64x2::from([1.0, -1.0]); let actual = a.copysign(b); assert_eq!(expected, actual); // let a = f64x2::from([-1.0, -1.0]); let b = f64x2::from([4.0, -5.0]); let expected = f64x2::from([1.0, -1.0]); let actual = a.copysign(b); assert_eq!(expected, actual); } #[test] fn impl_f64x2_sin_cos() { for x in -2500..=2500 { let base = (x * 4) as f64; let angles = [base, base + 1.0]; let (actual_sins, actual_coses) = f64x2::from(angles).sin_cos(); for i in 0..2 { let angle = angles[i]; let check = |name: &str, vals: f64x2, expected: f64| { let actual_arr: [f64; 2] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.00000006, "Wanted {name}({angle}) to be {expected} but got {actual}", name = name, angle = angle, expected = expected, actual = actual ); }; check("sin", actual_sins, angle.sin()); check("cos", actual_coses, angle.cos()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "sse")] #[test] fn impl_f64x2_asin_acos() { let inc = 1.0 / 2501.0 / 2.0; for x in -2500..=2500 { let base = (x * 2) as f64 * inc; let origs = [base, base + inc]; let (actual_asins, actual_acoses) = f64x2::from(origs).asin_acos(); for i in 0..2 { let orig = origs[i]; let check = |name: &str, vals: f64x2, expected: f64| { let 
actual_arr: [f64; 2] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.000000000000001, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("asin", actual_asins, orig.asin()); check("acos", actual_acoses, orig.acos()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "sse")] #[test] fn impl_f64x2_asin() { let inc = 1.0 / 2501.0 / 2.0; for x in -2500..=2500 { let base = (x * 2) as f64 * inc; let origs = [base, base + inc]; let actual_asins = f64x2::from(origs).asin(); for i in 0..2 { let orig = origs[i]; let check = |name: &str, vals: f64x2, expected: f64| { let actual_arr: [f64; 2] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.000000000000001, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("asin", actual_asins, orig.asin()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "sse")] #[test] fn impl_f64x2_acos() { let inc = 1.0 / 2501.0 / 2.0; for x in -2500..=2500 { let base = (x * 2) as f64 * inc; let origs = [base, base + inc]; let actual_acoses = f64x2::from(origs).acos(); for i in 0..2 { let orig = origs[i]; let check = |name: &str, vals: f64x2, expected: f64| { let actual_arr: [f64; 2] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.000000000000001, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("acos", actual_acoses, orig.acos()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "sse")] #[test] fn impl_f64x2_atan() { let inc = 1.0 / 2501.0 / 2.0; for x in -2500..=2500 { let base = (x * 2) as f64 * inc; let origs = [base, base + inc]; let actual_atans = f64x2::from(origs).atan(); for i in 0..2 { let orig = origs[i]; let check = |name: &str, vals: f64x2, expected: f64| { let actual_arr: [f64; 2] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.000000000000001, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("atan", actual_atans, orig.atan()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "sse")] #[test] fn impl_f64x2_atan2() { let inc_y = 1.0 / 51.0 / 2.0; let inc_x = 1.0 / 2501.0 / 2.0; for y in -50..=50 { let base_y = (y * 2) as f64 * inc_y; let origs_y = [base_y, base_y + inc_y]; let actual_y = f64x2::from(origs_y); for x in -2500..=2500 { let base_x = (x * 2) as f64 * inc_x; let origs_x = [base_x, base_x + inc_x]; let actual_x = f64x2::from(origs_x); let actual_atan2s = actual_y.atan2(actual_x); for i in 0..2 { let orig_y = origs_y[i]; let orig_x = origs_x[i]; let check = |name: &str, vals: f64x2, expected: f64| { let actual_arr: [f64; 2] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.000000000000001, "Wanted {name}({orig_y}, {orig_x}) to be {expected} but got {actual}", name = name, orig_y = orig_y, orig_x = orig_x, expected = expected, actual = actual ); }; check("atan2", actual_atan2s, orig_y.atan2(orig_x)); } } } } #[test] fn impl_f64x2_to_degrees() { let pi = core::f64::consts::PI; let a = f64x2::from([0.0, pi / 2.0]); let expected = 
f64x2::from([0.0, 90.0]); let actual = a.to_degrees(); assert_eq!(expected, actual); // let a = f64x2::from([pi, pi * 2.0]); let expected = f64x2::from([180.0, 360.0]); let actual = a.to_degrees(); assert_eq!(expected, actual); } #[test] fn impl_f64x2_to_radians() { let pi = core::f64::consts::PI; let a = f64x2::from([0.0, 90.0]); let expected = f64x2::from([0.0, pi / 2.0]); let actual = a.to_radians(); assert_eq!(expected, actual); // let a = f64x2::from([180.0, 360.0]); let expected = f64x2::from([pi, pi * 2.0]); let actual = a.to_radians(); assert_eq!(expected, actual); } #[test] fn impl_f64x2_sqrt() { for (f, e) in [ (f64::INFINITY, f64::INFINITY), (0.0, 0.0), (-0.0, -0.0), (4.0, 2.0), (9.0, 3.0), (16.0, 4.0), (25.0, 5.0), (5000.0 * 5000.0, 5000.0), ] .iter() .copied() { let expected = f64x2::from(e); let actual = f64x2::from(f).sqrt(); assert_eq!(expected, actual); } assert_eq!( cast::<_, i64x2>(f64x2::from(f64::NAN).sqrt().is_nan()), i64x2::from(-1) ); assert_eq!( cast::<_, i64x2>(f64x2::from(f64::NEG_INFINITY).sqrt().is_nan()), i64x2::from(-1) ); assert_eq!( cast::<_, i64x2>(f64x2::from(-1.0).sqrt().is_nan()), i64x2::from(-1) ); } #[test] fn test_f64x2_move_mask() { let a = f64x2::from([-1.0, 0.0]); let expected = 0b01; let actual = a.move_mask(); assert_eq!(expected, actual); // let a = f64x2::from([1.0, -0.0]); let expected = 0b10; let actual = a.move_mask(); assert_eq!(expected, actual); } #[test] fn impl_f64x2_exp() { for f in [(-2.0), (-1.0), (0.0), (1.0), (1.5), (2.0), (10.0)].iter().copied() { let expected = f64x2::from((f as f64).exp()); let actual = f64x2::from(f).exp(); let diff_from_std: [f64; 2] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000000000000001); } } #[test] fn test_f64x2_any() { let a = f64x2::from([-1.0, f64::NAN]).is_nan(); assert!(a.any()); // let a = f64x2::from([1.0, 0.0]).is_nan(); assert!(!a.any()); } #[test] fn test_f64x2_all() { let a = f64x2::from([f64::NAN, f64::NAN]).is_nan(); assert!(a.all()); // let a = f64x2::from([1.0, f64::NAN]).is_nan(); assert!(!a.all()); } #[test] fn test_f64x2_none() { let a = f64x2::from([1.0, 0.0]).is_nan(); assert!(a.none()); // let a = f64x2::from([1.0, f64::NAN]).is_nan(); assert!(!a.none()); } #[test] fn impl_f64x2_ln() { for f in [0.1f64, 0.5, 1.0, 2.718282, 10.0, 35.0, 1250.0].iter().copied() { let expected = f64x2::from((f as f64).ln()); let actual = f64x2::from(f).ln(); let diff_from_std: [f64; 2] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000000000001); } } #[test] fn impl_f64x2_pow_single() { for f in [0.1, 0.5, 1.0, 2.718282, 3.0, 4.0, 2.5, -1.0].iter().copied() { let expected = f64x2::splat(2.0 as f64).powf(f); let actual = f64x2::from(2.0_f64.powf(f)); let diff_from_std: [f64; 2] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000001); } } #[cfg(target_feature = "sse")] #[test] fn impl_f64x2_pow_nan() { for f in [3.4].iter().copied() { let expected: [f64; 2] = cast(f64x2::splat(-4.5 as f64).powf(f)); let actual = (-4.5_f64).powf(f); dbg!(&actual); dbg!(&expected); assert!(expected[0].is_nan()); assert!(actual.is_nan()); } } #[test] fn impl_f64x2_pow_multiple() { let p = f64x2::from([29.0, 0.1]); let f = f64x2::from([1.2, 2.0]); let res = f.pow_f64x2(p); let p: [f64; 2] = cast(p); let f: [f64; 2] = cast(f); let res: [f64; 2] = cast(res); for i in 0..p.len() { let expected = f[i].powf(p[i]); if !(expected.is_nan() && res[i].is_nan()) { assert!((expected - res[i]).abs() < 0.0001); } } let p = f64x2::from([2.718282, -0.2]); let f = f64x2::from([9.2, 6.1]); 
let res = f.pow_f64x2(p); let p: [f64; 2] = cast(p); let f: [f64; 2] = cast(f); let res: [f64; 2] = cast(res); for i in 0..p.len() { let expected = f[i].powf(p[i]); if !(expected.is_nan() && res[i].is_nan()) { assert!((expected - res[i]).abs() < 0.0001); } } let p = f64x2::from([-1.5, 3.4]); let f = f64x2::from([2.5, 4.5]); let res = f.pow_f64x2(p); let p: [f64; 2] = cast(p); let f: [f64; 2] = cast(f); let res: [f64; 2] = cast(res); for i in 0..p.len() { let expected = f[i].powf(p[i]); if !(expected.is_nan() && res[i].is_nan()) { dbg!(expected); dbg!(res[i]); assert!((expected - res[i]).abs() < 0.0001); } } } #[test] fn impl_f64x2_reduce_add() { let p = f64x2::splat(0.001); assert_eq!(p.reduce_add(), 0.002); } #[test] fn impl_f64x2_sum() { let mut p = Vec::with_capacity(250_000); for _ in 0..500_000 { p.push(f64x2::splat(0.001)); } let now = std::time::Instant::now(); let sum: f64 = p.iter().map(|x| x.reduce_add()).sum(); let duration = now.elapsed().as_micros(); println!("Time taken {} {}us", sum, duration); let p = vec![0.001; 1_000_000]; let now = std::time::Instant::now(); let sum2: f64 = p.iter().sum(); let duration = now.elapsed().as_micros(); println!("Time taken {} {}us", sum2, duration); } #[test] fn impl_f64x2_from_i32x4() { let i = i32x4::from([1, 2, 3, 4]); let f = f64x2::from([1.0, 2.0]); assert_eq!(f64x2::from_i32x4_lower2(i), f) } #[cfg(feature = "serde")] #[test] fn impl_f64x2_ser_de_roundtrip() { let serialized = bincode::serialize(&f64x2::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserialization failed"); assert_eq!(f64x2::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_f64x4.rs use core::f64; use wide::*; use bytemuck::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::<f64x4>(), 32); assert_eq!(core::mem::align_of::<f64x4>(), 32); } #[test] fn impl_debug_for_f64x4() { let expected = "(1.0, 2.0, 3.0, 4.0)"; let actual = format!("{:?}", f64x4::from([1.0, 2.0, 3.0, 4.0])); assert_eq!(expected, actual); let expected = "(1.000, 2.000, 3.000, 4.000)"; let actual = format!("{:.3?}", f64x4::from([1.0, 2.0, 3.0, 4.0])); assert_eq!(expected, actual); } #[test] fn impl_add_for_f64x4() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let b = f64x4::from([5.0, 6.0, 7.0, 8.0]); let expected = f64x4::from([6.0, 8.0, 10.0, 12.0]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_f64x4() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let b = f64x4::from([5.0, 7.0, 17.0, 1.0]); let expected = f64x4::from([-4.0, -5.0, -14.0, 3.0]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_f64x4() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let b = f64x4::from([5.0, 7.0, 17.0, 1.0]); let expected = f64x4::from([5.0, 14.0, 51.0, 4.0]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_div_for_f64x4() { let a = f64x4::from([4.0, 9.0, 10.0, 12.0]); let b = f64x4::from([2.0, 2.0, 5.0, -3.0]); let expected = f64x4::from([2.0, 4.5, 2.0, -4.0]); let actual = a / b; assert_eq!(expected, actual); } #[test] fn impl_sub_const_for_f64x4() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let expected = f64x4::from([-1.0, 0.0, 1.0, 2.0]); let actual = a - 2.0; assert_eq!(expected, actual); } #[test] fn impl_mul_const_for_f64x4() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let expected = f64x4::from([2.0, 4.0, 6.0, 8.0]); let actual = a * 2.0; assert_eq!(expected, actual); } #[test] fn impl_div_const_for_f64x4() { let a 
= f64x4::from([1.0, 2.0, 3.0, 4.0]); let expected = f64x4::from([0.5, 1.0, 1.5, 2.0]); let actual = a / 2.0; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_f64x4() { let a = f64x4::from([0.0, 0.0, 1.0, 1.0]); let b = f64x4::from([0.0, 1.0, 0.0, 1.0]); let expected = f64x4::from([0.0, 0.0, 0.0, 1.0]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_f64x4() { let a = f64x4::from([0.0, 0.0, 1.0, 1.0]); let b = f64x4::from([0.0, 1.0, 0.0, 1.0]); let expected = f64x4::from([0.0, 1.0, 1.0, 1.0]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_f64x4() { let a = f64x4::from([0.0, 0.0, 1.0, 1.0]); let b = f64x4::from([0.0, 1.0, 0.0, 1.0]); let expected = f64x4::from([0.0, 1.0, 1.0, 0.0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_f64x4_cmp_eq() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let b = f64x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i64; 4] = [0, -1, 0, 0]; let actual: [i64; 4] = cast(a.cmp_eq(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x4_cmp_ne() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let b = f64x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i64; 4] = [-1, 0, -1, -1]; let actual: [i64; 4] = cast(a.cmp_ne(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x4_cmp_ge() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let b = f64x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i64; 4] = [0, -1, -1, -1]; let actual: [i64; 4] = cast(a.cmp_ge(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x4_cmp_gt() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let b = f64x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i64; 4] = [0, 0, -1, -1]; let actual: [i64; 4] = cast(a.cmp_gt(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x4_cmp_le() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let b = f64x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i64; 4] = [-1, -1, 0, 0]; let actual: [i64; 4] = cast(a.cmp_le(b)); assert_eq!(expected, actual); } #[test] fn impl_f64x4_cmp_lt() { let a = f64x4::from([1.0, 2.0, 3.0, 4.0]); let b = f64x4::from([2.0, 2.0, 2.0, 2.0]); let expected: [i64; 4] = [-1, 0, 0, 0]; let actual: [i64; 4] = cast(a.cmp_lt(b)); assert_eq!(expected, actual); let expected: [i64; 4] = [0, 0, 0, 0]; let actual: [i64; 4] = cast(a.cmp_lt(a)); assert_eq!(expected, actual); } #[test] fn impl_f64x4_blend() { let use_t: f64 = f64::from_bits(u64::MAX); let t = f64x4::from([1.0, 2.0, 3.0, 4.0]); let f = f64x4::from([5.0, 6.0, 7.0, 8.0]); let mask = f64x4::from([use_t, 0.0, use_t, 0.0]); let expected = f64x4::from([1.0, 6.0, 3.0, 8.0]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_f64x4_abs() { let a = f64x4::from([-1.0, 2.0, -3.5, f64::NEG_INFINITY]); let expected = f64x4::from([1.0, 2.0, 3.5, f64::INFINITY]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_f64x4_floor() { let a = f64x4::from([-1.1, 60.9, 1.1, f64::INFINITY]); let expected = f64x4::from([-2.0, 60.0, 1.0, f64::INFINITY]); let actual = a.floor(); assert_eq!(expected, actual); } #[test] fn impl_f64x4_ceil() { let a = f64x4::from([-1.1, 60.9, 1.1, f64::NEG_INFINITY]); let expected = f64x4::from([-1.0, 61.0, 2.0, f64::NEG_INFINITY]); let actual = a.ceil(); assert_eq!(expected, actual); } #[test] fn impl_f64x4_fast_max() { let a = f64x4::from([1.0, 5.0, 3.0, -0.0]); let b = f64x4::from([2.0, f64::NEG_INFINITY, f64::INFINITY, 0.0]); let expected = f64x4::from([2.0, 5.0, f64::INFINITY, 0.0]); let actual = a.fast_max(b); assert_eq!(expected, actual); } #[test] fn 
impl_f64x4_max() { let a = f64x4::from([1.0, 5.0, 3.0, -0.0]); let b = f64x4::from([2.0, f64::NEG_INFINITY, f64::INFINITY, 0.0]); let expected = f64x4::from([2.0, 5.0, f64::INFINITY, 0.0]); let actual = a.max(b); assert_eq!(expected, actual); let a = f64x4::from([f64::NAN, 5.0, f64::INFINITY, f64::NAN]); let b = f64x4::from([2.0, f64::NAN, f64::NAN, f64::INFINITY]); let expected = f64x4::from([2.0, 5.0, f64::INFINITY, f64::INFINITY]); let actual = a.max(b); assert_eq!(expected, actual); } #[test] fn impl_f64x4_fast_min() { let a = f64x4::from([1.0, 5.0, 3.0, -0.0]); let b = f64x4::from([2.0, f64::NEG_INFINITY, f64::INFINITY, 0.0]); let expected = f64x4::from([1.0, f64::NEG_INFINITY, 3.0, -0.0]); let actual = a.fast_min(b); assert_eq!(expected, actual); } #[test] fn impl_f64x4_min() { let a = f64x4::from([1.0, 5.0, 3.0, -0.0]); let b = f64x4::from([2.0, f64::NEG_INFINITY, f64::INFINITY, 0.0]); let expected = f64x4::from([1.0, f64::NEG_INFINITY, 3.0, -0.0]); let actual = a.min(b); assert_eq!(expected, actual); let a = f64x4::from([f64::NAN, 5.0, f64::INFINITY, f64::NAN]); let b = f64x4::from([2.0, f64::NAN, f64::NAN, f64::INFINITY]); let expected = f64x4::from([2.0, 5.0, f64::INFINITY, f64::INFINITY]); let actual = a.min(b); assert_eq!(expected, actual); } #[test] fn impl_f64x4_is_nan() { let a = f64x4::from([0.0, f64::NAN, f64::NAN, 0.0]); let expected = [0, u64::MAX, u64::MAX, 0]; let actual: [u64; 4] = cast(a.is_nan()); assert_eq!(expected, actual); } #[test] fn impl_f64x4_is_finite() { let a = f64x4::from([f64::NAN, 1.0, f64::INFINITY, f64::NEG_INFINITY]); let expected = [0, u64::MAX, 0, 0]; let actual: [u64; 4] = cast(a.is_finite()); assert_eq!(expected, actual); } #[test] fn impl_f64x4_round() { let a = f64x4::from([1.1, 2.5, 3.7, 4.0]); let expected = f64x4::from([1.0, 2.0, 4.0, 4.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f64x4::from([-1.1, -2.5, -3.7, -4.0]); let expected = f64x4::from([-1.0, -2.0, -4.0, -4.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f64x4::from([f64::INFINITY, f64::NEG_INFINITY, 5.5, 5.0]); let expected = f64x4::from([f64::INFINITY, f64::NEG_INFINITY, 6.0, 5.0]); let actual = a.round(); assert_eq!(expected, actual); // let a = f64x4::from(f64::NAN); let expected: [u64; 4] = [u64::MAX; 4]; let actual: [u64; 4] = cast(a.round().is_nan()); assert_eq!(expected, actual); // let a = f64x4::from(-0.0); let expected = a; let actual = a.round(); assert_eq!(expected, actual); } #[test] fn impl_f64x4_round_int() { for (f, i) in [ (1.0, 1i64), (1.1, 1), (-2.1, -2), (2.5, 2), (0.0, 0), (-0.0, 0), (f64::NAN, 0), (f64::INFINITY, i64::MAX), (f64::NEG_INFINITY, i64::MIN), ] .iter() .copied() { let a = f64x4::from(f); let expected = i64x4::from(i); let actual = a.round_int(); assert_eq!(expected, actual); } } #[test] fn impl_f64x4_mul_add() { let a = f64x4::from([2.0, 3.0, 4.0, 5.0]); let b = f64x4::from([4.0, 5.0, 6.0, 7.0]); let c = f64x4::from([1.0, 1.0, 1.0, 1.0]); let expected = f64x4::from([9.0, 16.0, 25.0, 36.0]); let actual = a.mul_add(b, c); assert_eq!(expected, actual); } #[test] fn impl_f64x4_mul_neg_add() { let a = f64x4::from([2.0, 3.0, 4.0, 5.0]); let b = f64x4::from([4.0, 5.0, 6.0, 7.0]); let c = f64x4::from([1.0, 1.0, 1.0, 1.0]); let expected = f64x4::from([-7.0, -14.0, -23.0, -34.0]); let actual = a.mul_neg_add(b, c); assert_eq!(expected, actual); } #[test] fn impl_f64x4_flip_signs() { let a = f64x4::from([1.0, 1.0, -1.0, -1.0]); let b = f64x4::from([2.0, -3.0, 4.0, -5.0]); let expected = f64x4::from([1.0, -1.0, 
-1.0, 1.0]); let actual = a.flip_signs(b); assert_eq!(expected, actual); } #[test] fn impl_f64x4_copysign() { let a = f64x4::from([1.0, 1.0, -1.0, -1.0]); let b = f64x4::from([2.0, -3.0, 4.0, -5.0]); let expected = f64x4::from([1.0, -1.0, 1.0, -1.0]); let actual = a.copysign(b); assert_eq!(expected, actual); } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "avx")] #[test] fn impl_f64x4_asin_acos() { let inc = 1.0 / 2501.0 / 4.0; for x in -2500..=2500 { let base = (x * 4) as f64 * inc; let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; let (actual_asins, actual_acoses) = f64x4::from(origs).asin_acos(); for i in 0..4 { let orig = origs[i]; let check = |name: &str, vals: f64x4, expected: f64| { let actual_arr: [f64; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("asin", actual_asins, orig.asin()); check("acos", actual_acoses, orig.acos()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "avx")] #[test] fn impl_f64x4_asin() { let inc = 1.0 / 2501.0 / 4.0; for x in -2500..=2500 { let base = (x * 4) as f64 * inc; let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; let actual_asins = f64x4::from(origs).asin(); for i in 0..4 { let orig = origs[i]; let check = |name: &str, vals: f64x4, expected: f64| { let actual_arr: [f64; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("asin", actual_asins, orig.asin()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "avx")] #[test] fn impl_f64x4_acos() { let inc = 1.0 / 2501.0 / 4.0; for x in -2500..=2500 { let base = (x * 4) as f64 * inc; let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; let actual_acoses = f64x4::from(origs).acos(); for i in 0..4 { let orig = origs[i]; let check = |name: &str, vals: f64x4, expected: f64| { let actual_arr: [f64; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.0000006, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("acos", actual_acoses, orig.acos()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "avx")] #[test] fn impl_f64x4_atan() { let inc = 1.0 / 2501.0 / 4.0; for x in -2500..=2500 { let base = (x * 4) as f64 * inc; let origs = [base, base + inc, base + 2.0 * inc, base + 3.0 * inc]; let actual_atans = f64x4::from(origs).atan(); for i in 0..4 { let orig = origs[i]; let check = |name: &str, vals: f64x4, expected: f64| { let actual_arr: [f64; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.000000000000001, "Wanted {name}({orig}) to be {expected} but got {actual}", name = name, orig = orig, expected = expected, actual = actual ); }; check("atan", actual_atans, orig.atan()); } } } // FIXME: remove cfg requirement once masks as their own types are implemented #[cfg(target_feature = "avx")] #[test] fn impl_f64x4_atan2() { let inc_y = 1.0 / 51.0 / 4.0; let inc_x = 1.0 / 2501.0 / 4.0; for y in -50..=50 { let base_y = (y * 4) as f64 * inc_y; 
let origs_y = [base_y, base_y + inc_y, base_y + 2.0 * inc_y, base_y + 3.0 * inc_y]; let actual_y = f64x4::from(origs_y); for x in -2500..=2500 { let base_x = (x * 4) as f64 * inc_x; let origs_x = [base_x, base_x + inc_x, base_x + 2.0 * inc_x, base_x + 3.0 * inc_x]; let actual_x = f64x4::from(origs_x); let actual_atan2s = actual_y.atan2(actual_x); for i in 0..4 { let orig_y = origs_y[i]; let orig_x = origs_x[i]; let check = |name: &str, vals: f64x4, expected: f64| { let actual_arr: [f64; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.000000000000001, "Wanted {name}({orig_y}, {orig_x}) to be {expected} but got {actual}", name = name, orig_y = orig_y, orig_x = orig_x, expected = expected, actual = actual ); }; check("atan2", actual_atan2s, orig_y.atan2(orig_x)); } } } } #[test] fn impl_f64x4_sin_cos() { for x in -2500..=2500 { let base = (x * 4) as f64; let angles = [base, base + 1.0, base + 2.0, base + 3.0]; let (actual_sins, actual_coses) = f64x4::from(angles).sin_cos(); for i in 0..4 { let angle = angles[i]; let check = |name: &str, vals: f64x4, expected: f64| { let actual_arr: [f64; 4] = cast(vals); let actual = actual_arr[i]; assert!( (actual - expected).abs() < 0.00000006, "Wanted {name}({angle}) to be {expected} but got {actual}", name = name, angle = angle, expected = expected, actual = actual ); }; check("sin", actual_sins, angle.sin()); check("cos", actual_coses, angle.cos()); } } } #[test] fn impl_f64x4_to_degrees() { let pi = core::f64::consts::PI; let a = f64x4::from([0.0, pi / 2.0, pi, 2.0 * pi]); let expected = f64x4::from([0.0, 90.0, 180.0, 360.0]); let actual = a.to_degrees(); assert_eq!(expected, actual); } #[test] fn impl_f64x4_to_radians() { let pi = core::f64::consts::PI; let a = f64x4::from([0.0, 90.0, 180.0, 360.0]); let expected = f64x4::from([0.0, pi / 2.0, pi, 2.0 * pi]); let actual = a.to_radians(); assert_eq!(expected, actual); } #[test] fn impl_f64x4_sqrt() { for (f, e) in [ (f64::INFINITY, f64::INFINITY), (0.0, 0.0), (-0.0, -0.0), (4.0, 2.0), (9.0, 3.0), (16.0, 4.0), (25.0, 5.0), (5000.0 * 5000.0, 5000.0), ] .iter() .copied() { let expected = f64x4::from(e); let actual = f64x4::from(f).sqrt(); assert_eq!(expected, actual); } assert_eq!( cast::<_, i64x4>(f64x4::from(f64::NAN).sqrt().is_nan()), i64x4::from(-1) ); assert_eq!( cast::<_, i64x4>(f64x4::from(f64::NEG_INFINITY).sqrt().is_nan()), i64x4::from(-1) ); assert_eq!( cast::<_, i64x4>(f64x4::from(-1.0).sqrt().is_nan()), i64x4::from(-1) ); } #[test] fn impl_f64x4_exp() { for f in [(-2.0), (-1.0), (0.0), (1.0), (1.5), (2.0), (10.0)].iter().copied() { let expected = f64x4::from((f as f64).exp()); let actual = f64x4::from(f).exp(); let diff_from_std: [f64; 4] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000000000000001); } } #[test] fn test_f64x4_move_mask() { let a = f64x4::from([-1.0, 0.0, -2.0, -3.0]); let expected = 0b1101; let actual = a.move_mask(); assert_eq!(expected, actual); // let a = f64x4::from([1.0, 0.0, 2.0, -3.0]); let expected = 0b1000; let actual = a.move_mask(); assert_eq!(expected, actual); } #[test] fn test_f64x4_any() { let a = f64x4::from([-1.0, 0.0, -2.0, f64::NAN]).is_nan(); assert!(a.any()); // let a = f64x4::from([1.0, 0.0, 2.0, 3.0]).is_nan(); assert!(!a.any()); } #[test] fn test_f64x4_all() { let a = f64x4::from([f64::NAN, f64::NAN, f64::NAN, f64::NAN]).is_nan(); assert!(a.all()); // let a = f64x4::from([1.0, -0.0, 2.0, f64::NAN]).is_nan(); assert!(!a.all()); } #[test] fn test_f64x4_none() { let a = f64x4::from([1.0, 0.0, 2.0, 
3.0]).is_nan(); assert!(a.none()); // let a = f64x4::from([1.0, -0.0, 2.0, f64::NAN]).is_nan(); assert!(!a.none()); } #[test] fn impl_f64x4_ln() { if cfg!(target_feature = "sse") { for f in [0.1, 0.5, 1.0, 2.718282, 10.0, 35.0, 1250.0].iter().copied() { let expected = f64x4::from((f as f64).ln()); let actual = f64x4::from(f).ln(); let diff_from_std: [f64; 4] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.00000000001); } } } #[test] fn impl_f64x4_pow_single() { for f in [0.1, 0.5, 1.0, 2.718282, 3.0, 4.0, 2.5, -1.0].iter().copied() { let expected = f64x4::splat(2.0 as f64).powf(f); let actual = f64x4::from(2.0_f64.powf(f)); let diff_from_std: [f64; 4] = cast((actual - expected).abs()); assert!(diff_from_std[0] < 0.000001); } } #[cfg(target_feature = "sse")] #[test] // NOTE this fails due to the signbit not working with the non-sse blend // it only affects the case where there is a nan result fn impl_f64x4_pow_nan() { for f in [3.4].iter().copied() { let expected: [f64; 4] = cast(f64x4::splat(-4.5_f64).powf(f)); let actual = (-4.5_f64).powf(f); assert!(expected[0].is_nan()); assert!(actual.is_nan()); } } #[test] fn impl_f64x4_pow_multiple() { let p = f64x4::from([29.0, 0.1, 0.5, 1.0]); let f = f64x4::from([1.2, 2.0, 3.0, 1.5]); let res = f.pow_f64x4(p); let p: [f64; 4] = cast(p); let f: [f64; 4] = cast(f); let res: [f64; 4] = cast(res); for i in 0..p.len() { let expected = f[i].powf(p[i]); if expected.is_nan() && res[i].is_nan() { assert!(true); continue; } if !(expected.is_nan() && res[i].is_nan()) { assert!((expected - res[i]).abs() < 0.0001); } } let p = f64x4::from([2.718282, -0.2, -1.5, 3.4]); let f = f64x4::from([9.2, 6.1, 2.5, 4.5]); let res = f.pow_f64x4(p); let p: [f64; 4] = cast(p); let f: [f64; 4] = cast(f); let res: [f64; 4] = cast(res); for i in 0..p.len() { let expected = f[i].powf(p[i]); if !(expected.is_nan() && res[i].is_nan()) { assert!((expected - res[i]).abs() < 0.0001); } } } #[test] fn impl_f64x4_reduce_add() { let p = f64x4::splat(0.001); assert_eq!(p.reduce_add(), 0.004); } #[test] fn impl_f64x4_sum() { let mut p = Vec::with_capacity(250_000); for _ in 0..250_000 { p.push(f64x4::splat(0.001)); } let now = std::time::Instant::now(); let sum: f64 = p.iter().map(|x| x.reduce_add()).sum(); let duration = now.elapsed().as_micros(); println!("Time taken {} {}us", sum, duration); let p = vec![0.001; 1_000_000]; let now = std::time::Instant::now(); let sum2: f64 = p.iter().sum(); let duration = now.elapsed().as_micros(); println!("Time taken {} {}us", sum2, duration); } #[test] fn impl_f64x4_from_i32x4() { let i = i32x4::from([1, 2, 3, 4]); let f = f64x4::from([1.0, 2.0, 3.0, 4.0]); assert_eq!(f64x4::from(i), f); assert_eq!(f64x4::from_i32x4(i), f); } #[cfg(feature = "serde")] #[test] fn impl_f64x4_ser_de_roundtrip() { let serialized = bincode::serialize(&f64x4::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserialization failed"); assert_eq!(f64x4::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_i16x16.rs use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::<i16x16>(), 32); assert_eq!(core::mem::align_of::<i16x16>(), 32); } #[test] fn impl_add_for_i16x16() { let a = i16x16::from([ 1, 2, i16::MAX - 1, i16::MAX - 1, 15, 20, 5000, 2990, 1, 2, i16::MAX - 1, i16::MAX - 1, 15, 20, 5000, 2990, ]); let b = i16x16::from([ 17, 18, 1, 2, 20, 5, 900, 900, 17, 18, 1, 2, 20, 5, 900, 900, ]); let expected = i16x16::from([ 18, 20, i16::MAX, 
i16::MIN, 35, 25, 5900, 3890, 18, 20, i16::MAX, i16::MIN, 35, 25, 5900, 3890, ]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_i16x16() { let a = i16x16::from([ 1, 2, i16::MIN + 1, i16::MIN, 15, 20, 5000, 2990, 1, 2, i16::MIN + 1, i16::MIN, 15, 20, 5000, 2990, ]); let b = i16x16::from([ 17, -18, 1, 1, 20, 5, 900, 900, 17, -18, 1, 1, 20, 5, 900, 900, ]); let expected = i16x16::from([ -16, 20, i16::MIN, i16::MAX, -5, 15, 4100, 2090, -16, 20, i16::MIN, i16::MAX, -5, 15, 4100, 2090, ]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_saturating_add_for_i16x16() { let a = i16x16::from([ 1, 2, i16::MAX - 1, i16::MAX - 1, 15, 20, 5000, 2990, 1, 2, i16::MAX - 1, i16::MAX - 1, 15, 20, 5000, 2990, ]); let b = i16x16::from([ 17, 18, 1, 2, 20, 5, 900, 900, 17, 18, 1, 2, 20, 5, 900, 900, ]); let expected = i16x16::from([ 18, 20, i16::MAX, i16::MAX, 35, 25, 5900, 3890, 18, 20, i16::MAX, i16::MAX, 35, 25, 5900, 3890, ]); let actual = a.saturating_add(b); assert_eq!(expected, actual); } #[test] fn impl_saturating_sub_for_i16x16() { let a = i16x16::from([ 1, 2, i16::MIN + 1, i16::MIN, 15, 20, 5000, 2990, 1, 2, i16::MIN + 1, i16::MIN, 15, 20, 5000, 2990, ]); let b = i16x16::from([ 17, -18, 1, 1, 20, 5, 900, 900, 17, -18, 1, 1, 20, 5, 900, 900, ]); let expected = i16x16::from([ -16, 20, i16::MIN, i16::MIN, -5, 15, 4100, 2090, -16, 20, i16::MIN, i16::MIN, -5, 15, 4100, 2090, ]); let actual = a.saturating_sub(b); assert_eq!(expected, actual); } #[test] fn impl_mul_scale_i16x16() { let a = i16x16::from([ 0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, ]); let b = i16x16::from([ 0, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, ]); let actual = a.mul_scale_round(b); let expected = i16x16::from([ 0, 3, 6, 10, 15, 20, 26, 32, 39, 47, 55, 64, 73, 83, 94, 105, ]); assert_eq!(expected, actual); } #[test] fn impl_mul_scale_n_i16x16() { let a = i16x16::from([ 0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, ]); let actual = a.mul_scale_round_n(16400); // slightly higher than 0.5 to test rounding let expected = i16x16::from([ 0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 551, 601, 651, 701, 751, ]); assert_eq!(expected, actual); } #[test] fn impl_mul_for_i16x16() { let a = i16x16::from([ 1, 2, i16::MIN + 1, i16::MIN, 2, 3, 4, 5, 1, 2, i16::MIN + 1, i16::MIN, 2, 3, 4, 5, ]); let b = i16x16::from([17, -18, 1, 1, -1, -2, -6, 3, 17, -18, 1, 1, -1, -2, -6, 3]); let expected = i16x16::from([ 17, -36, i16::MIN + 1, i16::MIN, -2, -6, -24, 15, 17, -36, i16::MIN + 1, i16::MIN, -2, -6, -24, 15, ]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_i16x16() { let a = i16x16::from([0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1]); let b = i16x16::from([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]); let expected = i16x16::from([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_i16x16() { let a = i16x16::from([0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1]); let b = i16x16::from([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]); let expected = i16x16::from([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_i16x16() { let a = i16x16::from([0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1]); let b = i16x16::from([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]); let expected = 
i16x16::from([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_shl_for_i16x16() { let a = i16x16::from([ 1, 2, i16::MAX - 1, i16::MAX - 1, 128, 255, 590, 5667, 1, 2, i16::MAX - 1, i16::MAX - 1, 128, 255, 590, 5667, ]); let b = 2; let expected = i16x16::from([ 1 << 2, 2 << 2, (i16::MAX - 1) << 2, (i16::MAX - 1) << 2, 128 << 2, 255 << 2, 590 << 2, 5667 << 2, 1 << 2, 2 << 2, (i16::MAX - 1) << 2, (i16::MAX - 1) << 2, 128 << 2, 255 << 2, 590 << 2, 5667 << 2, ]); let actual = a << b; assert_eq!(expected, actual); } #[test] fn impl_shr_for_i16x16() { let a = i16x16::from([ 1, 2, i16::MAX - 1, i16::MAX - 1, 128, 255, 590, 5667, 1, 2, i16::MAX - 1, i16::MAX - 1, 128, 255, 590, 5667, ]); let b = 2; let expected = i16x16::from([ 1 >> 2, 2 >> 2, (i16::MAX - 1) >> 2, (i16::MAX - 1) >> 2, 128 >> 2, 255 >> 2, 590 >> 2, 5667 >> 2, 1 >> 2, 2 >> 2, (i16::MAX - 1) >> 2, (i16::MAX - 1) >> 2, 128 >> 2, 255 >> 2, 590 >> 2, 5667 >> 2, ]); let actual = a >> b; assert_eq!(expected, actual); } #[test] fn impl_i16x16_cmp_eq() { let a = i16x16::from([1, 2, 3, 4, 2, 1, 8, 2, 1, 2, 3, 4, 2, 1, 8, 2]); let b = i16x16::from([2_i16; 16]); let expected = i16x16::from([0, -1, 0, 0, -1, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, -1]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_i16x16_cmp_gt() { let a = i16x16::from([1, 2, 9, 4, 1, 2, 8, 10, 1, 2, 9, 4, 1, 2, 8, 10]); let b = i16x16::from([5_i16; 16]); let expected = i16x16::from([0, 0, -1, 0, 0, 0, -1, -1, 0, 0, -1, 0, 0, 0, -1, -1]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); } #[test] fn impl_i16x16_cmp_lt() { let a = i16x16::from([1, 2, 9, 4, 1, 2, 8, 10, 1, 2, 9, 4, 1, 2, 8, 10]); let b = i16x16::from([5_i16; 16]); let expected = i16x16::from([-1, -1, 0, -1, -1, -1, 0, 0, -1, -1, 0, -1, -1, -1, 0, 0]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); let expected = i16x16::from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let actual = a.cmp_lt(a); assert_eq!(expected, actual); } #[test] fn impl_i16x16_blend() { let use_t: i16 = -1; let t = i16x16::from([1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]); let f = i16x16::from([ 17, 18, 19, 20, 25, 30, 50, 90, 17, 18, 19, 20, 25, 30, 50, 90, ]); let mask = i16x16::from([ use_t, 0, use_t, 0, 0, 0, 0, use_t, use_t, 0, use_t, 0, 0, 0, 0, use_t, ]); let expected = i16x16::from([1, 18, 3, 20, 25, 30, 50, 8, 1, 18, 3, 20, 25, 30, 50, 8]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_i16x16_abs() { let a = i16x16::from([ -1, 2, -3, i16::MIN, 6, -15, -19, 9, -1, 2, -3, i16::MIN, 6, -15, -19, 9, ]); let expected = i16x16::from([ 1, 2, 3, i16::MIN, 6, 15, 19, 9, 1, 2, 3, i16::MIN, 6, 15, 19, 9, ]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_i16x16_max() { let a = i16x16::from([ 1, 2, i16::MIN + 1, i16::MIN, 6, -8, 12, 9, 1, 2, i16::MIN + 1, i16::MIN, 6, -8, 12, 9, ]); let b = i16x16::from([ 17, -18, 1, 1, 19, -5, -1, -9, 17, -18, 1, 1, 19, -5, -1, -9, ]); let expected = i16x16::from([17, 2, 1, 1, 19, -5, 12, 9, 17, 2, 1, 1, 19, -5, 12, 9]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i16x16, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_i16x16_min() { let a = i16x16::from([ 1, 2, i16::MIN + 1, i16::MIN, 6, -8, 12, 9, 1, 2, i16::MIN + 1, i16::MIN, 6, -8, 12, 9, ]); let b = i16x16::from([ 17, -18, 1, 1, 19, -5, -1, -9, 17, -18, 1, 1, 19, -5, -1, -9, ]); let expected = i16x16::from([ 1, -18, i16::MIN + 1, 
i16::MIN, 6, -8, -1, -9, 1, -18, i16::MIN + 1, i16::MIN, 6, -8, -1, -9, ]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i16x16, b| a.min(b), |a, b| a.min(b)); } #[test] fn impl_from_i8x16() { let a = i8x16::from([ 10, 2, -3, 4, 5, -6, 7, 8, 9, 7, i8::MAX, 12, 13, 6, 55, i8::MIN, ]); let actual = i16x16::from_i8x16(a); let expected = i16x16::from([ 10, 2, -3, 4, 5, -6, 7, 8, 9, 7, i8::MAX as i16, 12, 13, 6, 55, i8::MIN as i16, ]); assert_eq!(expected, actual); } #[test] fn test_i16x16_move_mask() { let indexes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; for x in 0..256 { // multiply by prime number to mix bits a bit let i = x * 251; let a = i16x16::from(indexes.map(|x| if i & (1 << x) != 0 { -1 } else { 0 })); assert_eq!(a.move_mask(), i); } let a = i16x16::from([-1, 0, -2, -3, -1, 0, -2, -3, -1, 0, -1, 0, -1, 0, -1, 0]); let expected = 0b0101010111011101; let actual = a.move_mask(); assert_eq!(expected, actual); } #[test] fn test_i16x16_any() { let a = i16x16::from([0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert!(a.any()); let a = i16x16::from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0]); assert!(a.any()); // let a = i16x16::from([0; 16]); assert!(!a.any()); } #[test] fn test_i16x16_all() { let a = i16x16::from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0]); assert!(!a.all()); // let a = i16x16::from([-1; 16]); assert!(a.all()); } #[test] fn test_i16x16_none() { let a = i16x16::from([0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert!(!a.none()); let a = i16x16::from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0]); assert!(!a.none()); // let a = i16x16::from([0; 16]); assert!(a.none()); } #[test] fn impl_i16x16_reduce_add() { let p = i16x16::from([1, 2, 3, 4, 5, 6, 7, 9, 10, 20, 30, 40, 50, 60, 70, 90]); assert_eq!(p.reduce_add(), 407); } #[test] fn impl_dot_for_i16x16() { let a = i16x16::from([ 1, 2, 3, 4, 5, 6, i16::MIN + 1, i16::MIN, 10, 20, 30, 40, 50, 60, i16::MAX - 1, i16::MAX, ]); let b = i16x16::from([ 17, -18, 190, -20, 21, -22, 3, 2, 170, -180, 1900, -200, 210, -220, 30, 20, ]); let expected = i32x8::from([-19, 490, -27, -163837, -1900, 49000, -2700, 1638320]); let actual = a.dot(b); assert_eq!(expected, actual); } #[test] fn impl_i16x16_reduce_min() { for i in 0..16 { let mut v = [i16::MAX; 16]; v[i] = i16::MIN; let p = i16x16::from(v); assert_eq!(p.reduce_min(), i16::MIN); } } #[test] fn impl_i16x16_reduce_max() { for i in 0..16 { let mut v = [i16::MIN; 16]; v[i] = i16::MAX; let p = i16x16::from(v); assert_eq!(p.reduce_max(), i16::MAX); } } #[cfg(feature = "serde")] #[test] fn impl_i16x16_ser_de_roundtrip() { let serialized = bincode::serialize(&i16x16::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserialization failed"); assert_eq!(i16x16::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_i16x8.rs use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::<i16x8>(), 16); assert_eq!(core::mem::align_of::<i16x8>(), 16); } #[test] fn impl_add_for_i16x8() { let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MAX - 1, i16::MAX - 1]); let b = i16x8::from([17, 18, 19, 20, 21, 22, 1, 2]); let expected = i16x8::from([18, 20, 22, 24, 26, 28, i16::MAX, i16::MIN]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_i16x8() { let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN + 1, i16::MIN]); let b = i16x8::from([17, -18, 190, -20, 21, -22, 1, 1]); let 
expected = i16x8::from([-16, 20, -187, 24, -16, 28, i16::MIN, i16::MAX]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_add_saturating_for_i16x8() { let a = i16x8::from([i16::MAX, i16::MIN, 3, 4, -1, -2, -3, -4]); let b = i16x8::from([i16::MAX, i16::MIN, 7, 8, -15, -26, -37, 48]); let expected = i16x8::from([i16::MAX, i16::MIN, 10, 12, -16, -28, -40, 44]); let actual = a.saturating_add(b); assert_eq!(expected, actual); } #[test] fn impl_mul_scale_i16x8() { let a = i16x8::from([100, 200, 300, 400, 500, -600, 700, -800]); let b = i16x8::from([900, 1000, 1100, 1200, 1300, -1400, -1500, 1600]); let actual = a.mul_scale_round(b); let expected = i16x8::from([3, 6, 10, 15, 20, 26, -32, -39]); assert_eq!(expected, actual); } #[test] fn impl_mul_scale_n_i16x8() { let a = i16x8::from([100, 200, 300, 400, 500, -600, 700, -800]); let actual = a.mul_scale_round_n(0x4000); let expected = i16x8::from([50, 100, 150, 200, 250, -300, 350, -400]); assert_eq!(expected, actual); } #[test] fn impl_sub_saturating_for_i16x8() { let a = i16x8::from([1, 2, 3, 4, 5, i16::MIN, i16::MIN + 1, i16::MAX]); let b = i16x8::from([17, -18, 190, -20, 21, -1, 1, -1]); let expected = i16x8::from([-16, 20, -187, 24, -16, i16::MIN + 1, i16::MIN, i16::MAX]); let actual = a.saturating_sub(b); assert_eq!(expected, actual); } #[test] fn impl_mul_for_i16x8() { let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN + 1, i16::MIN]); let b = i16x8::from([17, -18, 190, -20, 21, -22, 1, 1]); let expected = i16x8::from([17, -36, 570, -80, 105, -132, i16::MIN + 1, i16::MIN]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_transpose_for_i16x8() { let a = [ i16x8::new([0, 1, 2, 3, 4, 5, 6, 7]), i16x8::new([8, 9, 10, 11, 12, 13, 14, 15]), i16x8::new([16, 17, 18, 19, 20, 21, 22, 23]), i16x8::new([24, 25, 26, 27, 28, 29, 30, 31]), i16x8::new([32, 33, 34, 35, 36, 37, 38, 39]), i16x8::new([40, 41, 42, 43, 44, 45, 46, 47]), i16x8::new([48, 49, 50, 51, 52, 53, 54, 55]), i16x8::new([5600, 5700, 5800, 5900, 6000, 6100, 6200, 6300]), ]; let result = i16x8::transpose(a); let expected = [ i16x8::new([0, 8, 16, 24, 32, 40, 48, 5600]), i16x8::new([1, 9, 17, 25, 33, 41, 49, 5700]), i16x8::new([2, 10, 18, 26, 34, 42, 50, 5800]), i16x8::new([3, 11, 19, 27, 35, 43, 51, 5900]), i16x8::new([4, 12, 20, 28, 36, 44, 52, 6000]), i16x8::new([5, 13, 21, 29, 37, 45, 53, 6100]), i16x8::new([6, 14, 22, 30, 38, 46, 54, 6200]), i16x8::new([7, 15, 23, 31, 39, 47, 55, 6300]), ]; assert_eq!(result, expected); } #[test] fn impl_bitand_for_i16x8() { let a = i16x8::from([0, 0, 1, 1, 0, 0, 1, 1]); let b = i16x8::from([0, 1, 0, 1, 0, 1, 0, 1]); let expected = i16x8::from([0, 0, 0, 1, 0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_i16x8() { let a = i16x8::from([0, 0, 1, 1, 0, 0, 1, 1]); let b = i16x8::from([0, 1, 0, 1, 0, 1, 0, 1]); let expected = i16x8::from([0, 1, 1, 1, 0, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_i16x8() { let a = i16x8::from([0, 0, 1, 1, 0, 0, 1, 1]); let b = i16x8::from([0, 1, 0, 1, 0, 1, 0, 1]); let expected = i16x8::from([0, 1, 1, 0, 0, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_shl_for_i16x8() { let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN + 1, i16::MIN]); let b = 2; let expected = i16x8::from([ 1 << 2, 2 << 2, 3 << 2, 4 << 2, 5 << 2, 6 << 2, (i16::MIN + 1) << 2, i16::MIN << 2, ]); let actual = a << b; assert_eq!(expected, actual); } #[test] fn impl_shr_for_i16x8() { let a = i16x8::from([1, 2, 
3, 4, 5, 6, i16::MIN + 1, i16::MIN]); let b = 2; let expected = i16x8::from([ 1 >> 2, 2 >> 2, 3 >> 2, 4 >> 2, 5 >> 2, 6 >> 2, (i16::MIN + 1) >> 2, i16::MIN >> 2, ]); let actual = a >> b; assert_eq!(expected, actual); } #[test] fn impl_i16x8_cmp_eq() { let a = i16x8::from([1, 2, 3, 4, 1, 2, 3, 4]); let b = i16x8::from([2_i16; 8]); let expected = i16x8::from([0, -1, 0, 0, 0, -1, 0, 0]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_i16x8_cmp_gt() { let a = i16x8::from([1, 2, 3, 4, 1, 2, 3, 4]); let b = i16x8::from([2_i16; 8]); let expected = i16x8::from([0, 0, -1, -1, 0, 0, -1, -1]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); } #[test] fn impl_i16x8_cmp_lt() { let a = i16x8::from([1, 2, 3, 4, 1, 2, 3, 4]); let b = i16x8::from([2_i16; 8]); let expected = i16x8::from([-1, 0, 0, 0, -1, 0, 0, 0]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); let expected = i16x8::from([0, 0, 0, 0, 0, 0, 0, 0]); let actual = a.cmp_lt(a); assert_eq!(expected, actual); } #[test] fn impl_i16x8_blend() { let use_t: i16 = -1; let t = i16x8::from([1, 2, 3, 4, 5, 6, 7, 8]); let f = i16x8::from([17, 18, 19, 20, 21, 22, 23, 24]); let mask = i16x8::from([use_t, 0, use_t, 0, use_t, 0, use_t, 0]); let expected = i16x8::from([1, 18, 3, 20, 5, 22, 7, 24]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_i16x8_abs() { let a = i16x8::from([1, -2, 3, -4, 5, -6, -7, i16::MIN]); let expected = i16x8::from([1, 2, 3, 4, 5, 6, 7, i16::MIN]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_i16x8_unsigned_abs() { let a = i16x8::from([1, -2, 3, -4, 5, -6, -7, i16::MIN]); let expected = u16x8::from([1, 2, 3, 4, 5, 6, 7, i16::MIN as u16]); let actual = a.unsigned_abs(); assert_eq!(expected, actual); } #[test] fn impl_i16x8_max() { let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN + 1, i16::MIN]); let b = i16x8::from([17, -18, 190, -20, 21, -22, 1, 1]); let expected = i16x8::from([17, 2, 190, 4, 21, 6, 1, 1]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i16x8, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_i16x8_min() { let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN + 1, i16::MIN]); let b = i16x8::from([17, -18, 190, -20, 21, -22, 1, 1]); let expected = i16x8::from([1, -18, 3, -20, 5, -22, i16::MIN + 1, i16::MIN]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i16x8, b| a.min(b), |a, b| a.min(b)); } #[test] fn test_from_u8x16_low() { let bytes = u8x16::from([1, 2, 3, 4, 5, 6, 7, u8::MAX, 9, 10, 11, 12, 13, 14, 15, 16]); let expected = i16x8::from([1, 2, 3, 4, 5, 6, 7, u8::MAX as i16]); let actual = i16x8::from_u8x16_low(bytes); assert_eq!(expected, actual); } #[test] fn test_from_u8x16_high() { let a = u8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 128]); let expected = i16x8::from([9, 10, 11, 12, 13, 14, 255, 128]); let actual = i16x8::from_u8x16_high(a); assert_eq!(expected, actual); } #[test] fn impl_from_i32x8_truncate() { let src = i32x8::new([10000, 1001, 2, 3, 4, 5, -65536, 65536]); let expected = i16x8::new([10000, 1001, 2, 3, 4, 5, 0, 0]); let result = i16x8::from_i32x8_truncate(src); assert_eq!(result, expected); } #[test] fn impl_from_i32x8_saturate() { let src = i32x8::new([10000, 1001, 2, 3, 4, 5, -65535, 65536]); let expected = i16x8::new([10000, 1001, 2, 3, 4, 5, -32768, 32767]); let result = i16x8::from_i32x8_saturate(src); assert_eq!(result, expected); } #[test] fn impl_from_i16_slice() { let src = [0, 
1_i16, 2, 3, 4, 5, 6, 7, 8]; let result = i16x8::from_slice_unaligned(&src[1..9]); let expected = i16x8::new([1_i16, 2, 3, 4, 5, 6, 7, 8]); assert_eq!(result, expected); } #[test] fn test_i16x8_move_mask() { let a = i16x8::from([-1, 0, -2, -3, -1, 0, -2, -3]); let expected = 0b11011101; let actual = a.move_mask(); assert_eq!(expected, actual); // let a = i16x8::from([1, 0, 2, -3, 1, 0, 2, -3]); let expected = 0b10001000; let actual = a.move_mask(); assert_eq!(expected, actual); } #[test] fn test_i16x8_any() { let a = i16x8::from([0, 0, 0, -1, 0, 0, 0, 0]); assert!(a.any()); // let a = i16x8::from([0, 0, 0, 0, 0, 0, 0, 0]); assert!(!a.any()); } #[test] fn test_i16x8_all() { let a = i16x8::from([0, 0, 0, -1, 0, 0, 0, 0]); assert!(!a.all()); // let a = i16x8::from([-1; 8]); assert!(a.all()); } #[test] fn test_i16x8_none() { let a = i16x8::from([0, 0, 0, -1, 0, 0, 0, 0]); assert!(!a.none()); // let a = i16x8::from([0; 8]); assert!(a.none()); } #[test] fn impl_i16x8_reduce_add() { let p = i16x8::from([1, 2, 3, 4, 5, 6, 7, 9]); assert_eq!(p.reduce_add(), 37); } #[test] fn impl_dot_for_i16x8() { let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN + 1, i16::MIN]); let b = i16x8::from([17, -18, 190, -20, 21, -22, 3, 2]); let expected = i32x4::from([-19, 490, -27, -163837]); let actual = a.dot(b); assert_eq!(expected, actual); } #[test] fn impl_i16x8_reduce_min() { for i in 0..8 { let mut v = [i16::MAX; 8]; v[i] = i16::MIN; let p = i16x8::from(v); assert_eq!(p.reduce_min(), i16::MIN); } } #[test] fn impl_i16x8_reduce_max() { for i in 0..8 { let mut v = [i16::MIN; 8]; v[i] = i16::MAX; let p = i16x8::from(v); assert_eq!(p.reduce_min(), i16::MIN); } } #[test] fn impl_mul_keep_high() { let a = i16x8::from([i16::MAX, 200, 300, 4568, -1, -2, -3, -4]); let b = i16x8::from([i16::MIN, 600, 700, 8910, -15, -26, -37, 48]); let c: [i16; 8] = i16x8::mul_keep_high(a, b).into(); assert_eq!( c, [ (i32::from(i16::MAX) * i32::from(i16::MIN) >> 16) as i16, 1, 3, 621, 0, 0, 0, -1 ] ); crate::test_random_vector_vs_scalar( |a: i16x8, b| i16x8::mul_keep_high(a, b), |a, b| ((i32::from(a) * i32::from(b)) >> 16) as i16, ); } #[test] fn impl_i16x8_mul_widen() { let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN, i16::MAX]); let b = i16x8::from([17, -18, 190, -20, 21, -22, i16::MAX, i16::MAX]); let expected = i32x8::from([ 17, -36, 570, -80, 105, -132, (i16::MIN as i32) * (i16::MAX as i32), (i16::MAX as i32) * (i16::MAX as i32), ]); let actual = a.mul_widen(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: i16x8, b| a.mul_widen(b), |a, b| i32::from(a) * i32::from(b), ); } #[cfg(feature = "serde")] #[test] fn impl_i16x8_ser_de_roundtrip() { let serialized = bincode::serialize(&i16x8::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(i16x8::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_i32x4.rs000066400000000000000000000161471473735473700174660ustar00rootroot00000000000000use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] fn impl_add_for_i32x4() { let a = i32x4::from([1, 2, i32::MAX - 1, i32::MAX - 1]); let b = i32x4::from([17, 18, 1, 2]); let expected = i32x4::from([18, 20, i32::MAX, i32::MIN]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_i32x4() { let a = i32x4::from([1, 2, i32::MIN + 1, i32::MIN]); let b = i32x4::from([17, -18, 1, 1]); let expected = i32x4::from([-16, 20, i32::MIN, i32::MAX]); let actual = a 
- b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_i32x4() { let a = i32x4::from([1, 2, i32::MIN + 1, i32::MIN]); let b = i32x4::from([17, -18, 1, 1]); let expected = i32x4::from([17, -36, i32::MIN + 1, i32::MIN]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_i32x4() { let a = i32x4::from([0, 0, 1, 1]); let b = i32x4::from([0, 1, 0, 1]); let expected = i32x4::from([0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_i32x4() { let a = i32x4::from([0, 0, 1, 1]); let b = i32x4::from([0, 1, 0, 1]); let expected = i32x4::from([0, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_i32x4() { let a = i32x4::from([0, 0, 1, 1]); let b = i32x4::from([0, 1, 0, 1]); let expected = i32x4::from([0, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_shl_for_i32x4() { let a = i32x4::from([1, 2, i32::MAX - 1, i32::MAX - 1]); let b = 2; let expected = i32x4::from([1 << 2, 2 << 2, (i32::MAX - 1) << 2, (i32::MAX - 1) << 2]); let actual = a << b; assert_eq!(expected, actual); } #[test] fn impl_shr_for_i32x4() { let a = i32x4::from([1, 2, i32::MAX - 1, i32::MAX - 1]); let b = 2; let expected = i32x4::from([1 >> 2, 2 >> 2, (i32::MAX - 1) >> 2, (i32::MAX - 1) >> 2]); let actual = a >> b; assert_eq!(expected, actual); } #[test] fn impl_i32x4_cmp_eq() { let a = i32x4::from([1, 2, 3, 4]); let b = i32x4::from([2_i32; 4]); let expected = i32x4::from([0, -1, 0, 0]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_i32x4_cmp_gt() { let a = i32x4::from([1, 2, 3, 4]); let b = i32x4::from([2_i32; 4]); let expected = i32x4::from([0, 0, -1, -1]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); } #[test] fn impl_i32x4_cmp_lt() { let a = i32x4::from([1, 2, 3, 4]); let b = i32x4::from([2_i32; 4]); let expected = i32x4::from([-1, 0, 0, 0]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); let expected = i32x4::from([0, 0, 0, 0]); let actual = a.cmp_lt(a); assert_eq!(expected, actual); } #[test] fn impl_i32x4_blend() { let use_t: i32 = -1; let t = i32x4::from([1, 2, 3, 4]); let f = i32x4::from([17, 18, 19, 20]); let mask = i32x4::from([use_t, 0, use_t, 0]); let expected = i32x4::from([1, 18, 3, 20]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_i32x4_abs() { let a = i32x4::from([-1, 2, -3, i32::MIN]); let expected = i32x4::from([1, 2, 3, i32::MIN]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_i32x4_unsigned_abs() { let a = i32x4::from([-1, 2, -3, i32::MIN]); let expected = u32x4::from([1, 2, 3, i32::MIN as u32]); let actual = a.unsigned_abs(); assert_eq!(expected, actual); } #[test] fn impl_i32x4_max() { let a = i32x4::from([1, 2, i32::MIN + 1, i32::MIN]); let b = i32x4::from([17, -18, 1, 1]); let expected = i32x4::from([17, 2, 1, 1]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i32x4, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_i32x4_min() { let a = i32x4::from([1, 2, i32::MIN + 1, i32::MIN]); let b = i32x4::from([17, -18, 1, 1]); let expected = i32x4::from([1, -18, i32::MIN + 1, i32::MIN]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i32x4, b| a.min(b), |a, b| a.min(b)); } #[test] fn impl_i32x4_round_float() { let a = i32x4::from([-1, 30, i32::MIN, i32::MAX]); let expected = f32x4::from([-1.0, 30.0, i32::MIN as f32, i32::MAX as f32]); let actual = a.round_float(); 
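// Note (added comment): i32::MAX has no exact f32 representation; the expected lane above is built with the same `as f32` cast that the conversion uses, so both sides round the same way and the comparison below stays exact.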
assert_eq!(expected, actual); } #[test] fn test_i32x4_move_mask() { let a = i32x4::from([-1, 0, -2, -3]); let expected = 0b1101; let actual = a.move_mask(); assert_eq!(expected, actual); // let a = i32x4::from([i32::MAX, 0, 2, -3]); let expected = 0b1000; let actual = a.move_mask(); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar_reduce( |a: i32x4| a.move_mask(), 0i32, |acc, a, idx| acc | if a < 0 { 1 << idx } else { 0 }, ); } #[test] fn test_i32x4_any() { let a = i32x4::from([0, 0, 0, -1]); assert!(a.any()); // let a = i32x4::from([0, 0, 0, 0]); assert!(!a.any()); } #[test] fn test_i32x4_all() { let a = i32x4::from([0, 0, 0, -1]); assert!(!a.all()); // let a = i32x4::from([-1; 4]); assert!(a.all()); } #[test] fn test_i32x4_none() { let a = i32x4::from([0, 0, 0, -1]); assert!(!a.none()); // let a = i32x4::from([0; 4]); assert!(a.none()); } #[test] fn impl_i32x4_reduce_add() { let p = i32x4::from([10000000, 20000000, 30000000, -40000000]); assert_eq!(p.reduce_add(), 20000000); } #[test] fn impl_i32x4_reduce_min() { for i in 0..4 { let mut v = [i32::MAX; 4]; v[i] = i32::MIN; let p = i32x4::from(v); assert_eq!(p.reduce_min(), i32::MIN); } } #[test] fn impl_i32x4_reduce_max() { for i in 0..4 { let mut v = [i32::MIN; 4]; v[i] = i32::MAX; let p = i32x4::from(v); assert_eq!(p.reduce_max(), i32::MAX); } } #[test] fn impl_i32x4_shr_each() { let a = i32x4::from([15313, 52322, -1, 4]); let shift = i32x4::from([1, 30, 8, 33 /* test masking behavior */]); let expected = i32x4::from([7656, 0, -1, 2]); let actual = a >> shift; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: i32x4, b| a >> b, |a, b| a.wrapping_shr(b as u32), ); } #[test] fn impl_i32x4_shl_each() { let a = i32x4::from([15313, 52322, -1, 4]); let shift = i32x4::from([1, 30, 8, 33 /* test masking behavior */]); let expected = i32x4::from([30626, -2147483648, -256, 8]); let actual = a << shift; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: i32x4, b| a << b, |a, b| a.wrapping_shl(b as u32), ); } #[test] fn impl_i32x4_mul_widen() { let a = i32x4::from([1, 2, 3 * -1000000, i32::MAX]); let b = i32x4::from([5, 6, 7 * -1000000, i32::MIN]); let expected = i64x4::from([ 1 * 5, 2 * 6, 3 * 7 * 1000000 * 1000000, i32::MIN as i64 * i32::MAX as i64, ]); let actual = a.mul_widen(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: i32x4, b| a.mul_widen(b), |a, b| a as i64 * b as i64, ); } #[cfg(feature = "serde")] #[test] fn impl_i32x4_ser_de_roundtrip() { let serialized = bincode::serialize(&i32x4::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(i32x4::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_i32x8.rs000066400000000000000000000233751473735473700174730ustar00rootroot00000000000000use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } #[test] fn impl_add_for_i32x8() { let a = i32x8::from([1, 2, i32::MAX - 1, i32::MAX - 1, 15, 20, 5000, 2990]); let b = i32x8::from([17, 18, 1, 2, 20, 5, 900, 900]); let expected = i32x8::from([18, 20, i32::MAX, i32::MIN, 35, 25, 5900, 3890]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_i32x8() { let a = i32x8::from([1, 2, i32::MIN + 1, i32::MIN, 15, 20, 5000, 2990]); let b = i32x8::from([17, -18, 1, 1, 20, 5, 900, 900]); let expected = i32x8::from([-16, 20, i32::MIN, i32::MAX, -5, 15, 4100, 2090]); let actual = a - b; assert_eq!(expected, 
actual); } #[test] fn impl_mul_for_i32x8() { let a = i32x8::from([1, 2, i32::MIN + 1, i32::MIN, 2, 3, 4, 5]); let b = i32x8::from([17, -18, 1, 1, -1, -2, -6, 3]); let expected = i32x8::from([17, -36, i32::MIN + 1, i32::MIN, -2, -6, -24, 15]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_i32x8() { let a = i32x8::from([0, 0, 1, 1, 1, 0, 0, 1]); let b = i32x8::from([0, 1, 0, 1, 0, 1, 1, 1]); let expected = i32x8::from([0, 0, 0, 1, 0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_i32x8() { let a = i32x8::from([0, 0, 1, 1, 1, 0, 0, 1]); let b = i32x8::from([0, 1, 0, 1, 0, 1, 1, 1]); let expected = i32x8::from([0, 1, 1, 1, 1, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_i32x8() { let a = i32x8::from([0, 0, 1, 1, 1, 0, 0, 1]); let b = i32x8::from([0, 1, 0, 1, 0, 1, 1, 1]); let expected = i32x8::from([0, 1, 1, 0, 1, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_shl_for_i32x8() { let a = i32x8::from([1, 2, i32::MAX - 1, i32::MAX - 1, 128, 255, 590, 5667]); let b = 2; let expected = i32x8::from([ 1 << 2, 2 << 2, (i32::MAX - 1) << 2, (i32::MAX - 1) << 2, 128 << 2, 255 << 2, 590 << 2, 5667 << 2, ]); let actual = a << b; assert_eq!(expected, actual); } #[test] fn impl_shr_for_i32x8() { let a = i32x8::from([1, 2, i32::MAX - 1, i32::MAX - 1, 128, 255, 590, 5667]); let b = 2; let expected = i32x8::from([ 1 >> 2, 2 >> 2, (i32::MAX - 1) >> 2, (i32::MAX - 1) >> 2, 128 >> 2, 255 >> 2, 590 >> 2, 5667 >> 2, ]); let actual = a >> b; assert_eq!(expected, actual); } #[test] fn impl_i32x8_cmp_eq() { let a = i32x8::from([1, 2, 3, 4, 2, 1, 8, 2]); let b = i32x8::from([2_i32; 8]); let expected = i32x8::from([0, -1, 0, 0, -1, 0, 0, -1]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_i32x8_cmp_gt() { let a = i32x8::from([1, 2, 9, 4, 1, 2, 8, 10]); let b = i32x8::from([5_i32; 8]); let expected = i32x8::from([0, 0, -1, 0, 0, 0, -1, -1]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); } #[test] fn impl_i32x8_cmp_lt() { let a = i32x8::from([1, 2, 9, 4, 1, 2, 8, 10]); let b = i32x8::from([5_i32; 8]); let expected = i32x8::from([-1, -1, 0, -1, -1, -1, 0, 0]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); let expected = i32x8::from([0, 0, 0, 0, 0, 0, 0, 0]); let actual = a.cmp_lt(a); assert_eq!(expected, actual); } #[test] fn impl_i32x8_blend() { let use_t: i32 = -1; let t = i32x8::from([1, 2, 3, 4, 5, 6, 7, 8]); let f = i32x8::from([17, 18, 19, 20, 25, 30, 50, 90]); let mask = i32x8::from([use_t, 0, use_t, 0, 0, 0, 0, use_t]); let expected = i32x8::from([1, 18, 3, 20, 25, 30, 50, 8]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_i32x8_abs() { let a = i32x8::from([-1, 2, -3, i32::MIN, 6, -15, -19, 9]); let expected = i32x8::from([1, 2, 3, i32::MIN, 6, 15, 19, 9]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_i32x8_unsigned_abs() { let a = i32x8::from([-1, 2, -3, i32::MIN, 6, -15, -19, 9]); let expected = u32x8::from([1, 2, 3, i32::MIN as u32, 6, 15, 19, 9]); let actual = a.unsigned_abs(); assert_eq!(expected, actual); } #[test] fn impl_i32x8_max() { let a = i32x8::from([1, 2, i32::MIN + 1, i32::MIN, 6, -8, 12, 9]); let b = i32x8::from([17, -18, 1, 1, 19, -5, -1, -9]); let expected = i32x8::from([17, 2, 1, 1, 19, -5, 12, 9]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i32x8, b| a.max(b), |a, b| a.max(b)); } #[test] fn 
impl_i32x8_min() { let a = i32x8::from([1, 2, i32::MIN + 1, i32::MIN, 6, -8, 12, 9]); let b = i32x8::from([17, -18, 1, 1, 19, -5, -1, -9]); let expected = i32x8::from([1, -18, i32::MIN + 1, i32::MIN, 6, -8, -1, -9]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i32x8, b| a.min(b), |a, b| a.min(b)); } #[test] fn impl_i32x8_round_float() { let a = i32x8::from([-1, 30, i32::MIN, i32::MAX, 29, 35, -8, 0]); let expected = f32x8::from([ -1.0, 30.0, i32::MIN as f32, i32::MAX as f32, 29.0, 35.0, -8.0, 0.0, ]); let actual = a.round_float(); assert_eq!(expected, actual); } #[test] fn impl_transpose_for_i32x8() { let a = [ i32x8::new([0, 1, 2, 3, 4, 5, 6, 7]), i32x8::new([8, 9, 10, 11, 12, 13, 14, 15]), i32x8::new([16, 17, 18, 19, 20, 21, 22, 23]), i32x8::new([24, 25, 26, 27, 28, 29, 30, 31]), i32x8::new([32, 33, 34, 35, 36, 37, 38, 39]), i32x8::new([40, 41, 42, 43, 44, 45, 46, 47]), i32x8::new([48, 49, 50, 51, 52, 53, 54, 55]), i32x8::new([ 5600000, 5700000, 5800000, 5900000, 6000000, 6100000, 6200000, 6300000, ]), ]; let result = i32x8::transpose(a); let expected = [ i32x8::new([0, 8, 16, 24, 32, 40, 48, 5600000]), i32x8::new([1, 9, 17, 25, 33, 41, 49, 5700000]), i32x8::new([2, 10, 18, 26, 34, 42, 50, 5800000]), i32x8::new([3, 11, 19, 27, 35, 43, 51, 5900000]), i32x8::new([4, 12, 20, 28, 36, 44, 52, 6000000]), i32x8::new([5, 13, 21, 29, 37, 45, 53, 6100000]), i32x8::new([6, 14, 22, 30, 38, 46, 54, 6200000]), i32x8::new([7, 15, 23, 31, 39, 47, 55, 6300000]), ]; assert_eq!(result, expected); } #[test] fn impl_from_i16x8() { let a = i16x8::from([1, 2, 3, 4, 5, 6, i16::MIN + 1, i16::MIN]); let actual = i32x8::from_i16x8(a); let expected = i32x8::from([1, 2, 3, 4, 5, 6, (i16::MIN + 1) as i32, i16::MIN as i32]); assert_eq!(actual, expected); } #[test] fn impl_from_u16x8() { let a = u16x8::from([1, 2, 3, 4, 5, i16::MAX as u16, u16::MAX - 1, u16::MAX]); let actual = i32x8::from_u16x8(a); let expected = i32x8::from([ 1, 2, 3, 4, 5, i16::MAX as i32, (u16::MAX - 1) as i32, u16::MAX as i32, ]); assert_eq!(actual, expected); crate::test_random_vector_vs_scalar( |a: u16x8, _b| i32x8::from_u16x8(a), |a, _b| a as u32 as i32, ); } #[test] fn test_i16x8_move_mask() { let a = i16x8::from([-1, 0, -2, -3, -1, 0, -2, -3]); let expected = 0b11011101; let actual = a.move_mask(); assert_eq!(expected, actual); // let a = i16x8::from([1, 0, 2, -3, 1, 0, 2, -3]); let expected = 0b10001000; let actual = a.move_mask(); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar_reduce( |a: i32x8| a.move_mask(), 0i32, |acc, a, idx| acc | if a < 0 { 1 << idx } else { 0 }, ); } #[test] fn test_i32x8_any() { let a = i32x8::from([0, 0, 0, -1, 0, 0, 0, 0]); assert!(a.any()); // let a = i32x8::from([0, 0, 0, 0, 0, 0, 0, 0]); assert!(!a.any()); crate::test_random_vector_vs_scalar_reduce( |a: i32x8| a.any(), false, |acc, a, _idx| acc | (a < 0), ); } #[test] fn test_i32x8_all() { let a = i32x8::from([0, 0, 0, -1, 0, 0, 0, 0]); assert!(!a.all()); // let a = i32x8::from([-1; 8]); assert!(a.all()); crate::test_random_vector_vs_scalar_reduce( |a: i32x8| a.all(), true, |acc, a, _idx| acc & (a < 0), ); } #[test] fn test_i32x8_none() { let a = i32x8::from([0, 0, 0, -1, 0, 0, 0, 0]); assert!(!a.none()); // let a = i32x8::from([0; 8]); assert!(a.none()); crate::test_random_vector_vs_scalar_reduce( |a: i32x8| a.none(), true, |acc, a, _idx| acc & !(a < 0), ); } #[test] fn impl_i32x8_reduce_add() { let p = i32x8::from([ 10000000, 20000000, 30000000, 40000000, 50000000, 60000000, 70000000, 
90000000, ]); assert_eq!(p.reduce_add(), 370000000); } #[test] fn impl_i32x8_reduce_min() { for i in 0..8 { let mut v = [i32::MAX; 8]; v[i] = i32::MIN; let p = i32x8::from(v); assert_eq!(p.reduce_min(), i32::MIN); } } #[test] fn impl_i32x8_reduce_max() { for i in 0..8 { let mut v = [i32::MIN; 8]; v[i] = i32::MAX; let p = i32x8::from(v); assert_eq!(p.reduce_max(), i32::MAX); } } #[test] fn impl_i32x4_shr_each() { let a = u32x8::from([15313, 52322, u32::MAX, 4, 10, 20, 30, 40]); let shift = u32x8::from([1, 30, 8, 33 /* test masking behavior */, 1, 2, 3, 4]); let expected = u32x8::from([7656, 0, 16777215, 2, 5, 5, 3, 2]); let actual = a >> shift; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: i32x8, b| a >> b, |a, b| a.wrapping_shr(b as u32), ); } #[test] fn impl_i32x8_shl_each() { let a = i32x8::from([15313, 52322, -1, 4, 1, 2, 3, 4]); let shift = i32x8::from([1, 30, 8, 33 /* test masking behavior */, 1, 2, 3, 4]); let expected = i32x8::from([30626, -2147483648, -256, 8, 2, 8, 24, 64]); let actual = a << shift; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: i32x8, b| a << b, |a, b| a.wrapping_shl(b as u32), ); } #[cfg(feature = "serde")] #[test] fn impl_i32x8_ser_de_roundtrip() { let serialized = bincode::serialize(&i32x8::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(i32x8::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_i64x2.rs000066400000000000000000000073731473735473700174720ustar00rootroot00000000000000use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] fn impl_add_for_i64x2() { let a = i64x2::from([i64::MAX - 1, i64::MAX - 1]); let b = i64x2::from([1, 2]); let expected = i64x2::from([i64::MAX, i64::MIN]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_i64x2() { let a = i64x2::from([i64::MIN + 1, i64::MIN]); let b = i64x2::from([1, 1]); let expected = i64x2::from([i64::MIN, i64::MAX]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_i64x2() { let a = i64x2::from([i64::MIN + 1, 24]); let b = i64x2::from([1, -26]); let expected = i64x2::from([i64::MIN + 1, 24 * -26]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_i64x2() { let a = i64x2::from([1, 1]); let b = i64x2::from([0, 1]); let expected = i64x2::from([0, 1]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_i64x2() { let a = i64x2::from([1, 1]); let b = i64x2::from([0, 1]); let expected = i64x2::from([1, 1]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_i64x2() { let a = i64x2::from([1, 1]); let b = i64x2::from([0, 1]); let expected = i64x2::from([1, 0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_shl_for_i64x2() { let a = i64x2::from([i64::MAX - 1, i64::MAX - 1]); let b = 2; let expected = i64x2::from([(i64::MAX - 1) << 2, (i64::MAX - 1) << 2]); let actual = a << b; assert_eq!(expected, actual); } #[test] fn impl_i64x2_blend() { let use_t: i64 = -1; let t = i64x2::from([1, 2]); let f = i64x2::from([17, 18]); let mask = i64x2::from([use_t, 0]); let expected = i64x2::from([1, 18]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_i64x2_abs() { let a = i64x2::from([-1, i64::MIN]); let expected = i64x2::from([1, i64::MIN]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_i64x2_unsigned_abs() { 
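// Note (added comment): this exercises `unsigned_abs` on i64::MIN, whose magnitude (2^63) does not fit in i64 but does fit in u64, which is why the expected result is a u64x2 instead of a wrapped i64x2.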
let a = i64x2::from([-1, i64::MIN]); let expected = u64x2::from([1, i64::MIN as u64]); let actual = a.unsigned_abs(); assert_eq!(expected, actual); } #[test] fn impl_i64x2_cmp_eq() { let a = i64x2::from([1_i64, 4]); let b = i64x2::from([3_i64, 4]); let expected = i64x2::from([0, -1]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_i64x2_cmp_gt() { let a = i64x2::from([3_i64, 4]); let b = i64x2::from([1_i64, 4]); let expected = i64x2::from([-1, 0]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); } #[test] fn test_i64x2_any() { let a = i64x2::from([3, -1]); assert!(a.any()); // let a = i64x2::from([1, 0]); assert!(!a.any()); } #[test] fn test_i64x2_all() { let a = i64x2::from([-1, -1]); assert!(a.all(), "{:?}", a); // let a = i64x2::from([1, -1]); assert!(!a.all()); } #[test] fn test_i64x2_none() { let a = i64x2::from([1, 0]); assert!(a.none()); // let a = i64x2::from([1, -1]); assert!(!a.none()); } #[test] fn test_i64x2_move_mask() { let a = i64x2::from([-1, 0]); let expected = 0b01; let actual = a.move_mask(); assert_eq!(expected, actual); // let a = i64x2::from([1, -1]); let expected = 0b10; let actual = a.move_mask(); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar_reduce( |a: i64x2| a.move_mask(), 0i32, |acc, a, idx| acc | if a < 0 { 1 << idx } else { 0 }, ); } #[cfg(feature = "serde")] #[test] fn impl_i64x2_ser_de_roundtrip() { let serialized = bincode::serialize(&i64x2::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(i64x2::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_i64x4.rs000066400000000000000000000110101473735473700174530ustar00rootroot00000000000000use std::num::Wrapping; use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } #[test] fn impl_add_for_i64x4() { let a = i64x4::from([i64::MAX - 1, i64::MAX - 1, 6, 9]); let b = i64x4::from([1, 2, 3, 4]); let expected = i64x4::from([i64::MAX, i64::MIN, 9, 13]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_i64x4() { let a = i64x4::from([1, 0, 9, 12]); let b = i64x4::from([1, 1, 3, 3]); let expected = i64x4::from([0, -1, 6, 9]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_i64x4() { let a = i64x4::from([i64::MIN + 1, 24, 5402, i64::MAX]); let b = i64x4::from([1, -26, -5402, 2]); let expected = i64x4::from([ i64::MIN + 1, 24 * -26, 5402 * -5402, (Wrapping(i64::MAX) * Wrapping(2)).0, ]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_i64x4() { let a = i64x4::from([1, 1, 0, 0]); let b = i64x4::from([0, 1, 0, 1]); let expected = i64x4::from([0, 1, 0, 0]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_i64x4() { let a = i64x4::from([1, 1, 0, 0]); let b = i64x4::from([0, 1, 0, 1]); let expected = i64x4::from([1, 1, 0, 1]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_i64x4() { let a = i64x4::from([1, 1, 1, 0]); let b = i64x4::from([0, 1, 0, 1]); let expected = i64x4::from([1, 0, 1, 1]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_shl_for_i64x4() { let a = i64x4::from([i64::MAX - 1, i64::MAX - 1, 65535, 0]); let b = 2; let expected = i64x4::from([(i64::MAX - 1) << 2, (i64::MAX - 1) << 2, 65535 << 2, 0 << 2]); let actual = a << b; assert_eq!(expected, actual); } #[test] fn impl_shr_for_i64x4() { let a = i64x4::from([i64::MAX - 1, i64::MAX - 1, 65535, 
0]); let b = 2; let expected = i64x4::from([(i64::MAX - 1) >> 2, (i64::MAX - 1) >> 2, 65535 >> 2, 0 >> 2]); let actual = a >> b; assert_eq!(expected, actual); } #[test] fn impl_i64x4_blend() { let use_t: i64 = i64::MAX; let t = i64x4::from([1, 2, 3, 4]); let f = i64x4::from([17, 18, 21, 45]); let mask = i64x4::from([use_t, 0, 0, use_t]); let expected = i64x4::from([1, 18, 21, 4]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_i64x4_abs() { let a = i64x4::from([-1, 2, -3, i64::MIN]); let expected = i64x4::from([1, 2, 3, i64::MIN]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_i64x4_unsigned_abs() { let a = i64x4::from([-1, 2, -3, i64::MIN]); let expected = u64x4::from([1, 2, 3, i64::MIN as u64]); let actual = a.unsigned_abs(); assert_eq!(expected, actual); } #[test] fn impl_i64x4_cmp_eq() { let a = i64x4::from([1_i64, 4, i64::MAX, 5]); let b = i64x4::from([3_i64, 4, i64::MAX, 1]); let expected = i64x4::from([0, -1, -1, 0]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn test_i64x4_move_mask() { let a = i64x4::from([-1, 0, -2, -3]); let expected = 0b1101; let actual = a.move_mask(); assert_eq!(expected, actual); // let a = i64x4::from([i64::MAX, 0, 2, -3]); let expected = 0b1000; let actual = a.move_mask(); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar_reduce( |a: i64x4| a.move_mask(), 0i32, |acc, a, idx| acc | if a < 0 { 1 << idx } else { 0 }, ); } #[test] fn test_i64x4_any() { let a = i64x4::from([0, 0, 0, -1]); assert!(a.any()); // let a = i64x4::from([0, 0, 0, 0]); assert!(!a.any()); crate::test_random_vector_vs_scalar_reduce( |a: i64x4| a.any(), false, |acc, a, _idx| acc | acc | (a < 0), ); } #[test] fn test_i32x4_all() { let a = i64x4::from([0, 0, 0, -1]); assert!(!a.all()); // let a = i64x4::from([-1; 4]); assert!(a.all()); crate::test_random_vector_vs_scalar_reduce( |a: i64x4| a.all(), true, |acc, a, _idx| acc & (a < 0), ); } #[test] fn test_i32x4_none() { let a = i64x4::from([0, 0, 0, -1]); assert!(!a.none()); // let a = i64x4::from([0; 4]); assert!(a.none()); crate::test_random_vector_vs_scalar_reduce( |a: i64x4| a.none(), true, |acc, a, _idx| acc & !(a < 0), ); } #[cfg(feature = "serde")] #[test] fn impl_i64x4_ser_de_roundtrip() { let serialized = bincode::serialize(&i64x4::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(i64x4::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_i8x16.rs000066400000000000000000000231701473735473700174660ustar00rootroot00000000000000use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] fn impl_add_for_i8x16() { let a = i8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, 127]); let b = i8x16::from([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1]); let expected = i8x16::from([ 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 127, -128, ]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_i8x16() { let a = i8x16::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, i8::MIN + 1, i8::MIN, ]); let b = i8x16::from([17, 27, -1, 20, 21, -8, 23, 0, 1, 2, -9, 28, 64, 30, 1, 1]); let expected = i8x16::from([ -16, -25, 4, -16, -16, 14, -16, 8, 8, 8, 20, -16, -51, -16, i8::MIN, i8::MAX, ]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_saturating_add_for_i8x16() { let a = i8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 
12, 13, 14, 126, 127]); let b = i8x16::from([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1]); let expected = i8x16::from([ 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 127, 127, ]); let actual = a.saturating_add(b); assert_eq!(expected, actual); } #[test] fn impl_saturating_sub_for_i8x16() { let a = i8x16::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, i8::MIN + 1, i8::MIN, ]); let b = i8x16::from([17, 27, -1, 20, 21, -8, 23, 0, 1, 2, -9, 28, 64, 30, 1, 1]); let expected = i8x16::from([ -16, -25, 4, -16, -16, 14, -16, 8, 8, 8, 20, -16, -51, -16, i8::MIN, i8::MIN, ]); let actual = a.saturating_sub(b); assert_eq!(expected, actual); } #[test] fn impl_bitand_for_i8x16() { let a = i8x16::from([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]); let b = i8x16::from([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); let expected = i8x16::from([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_i8x16() { let a = i8x16::from([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]); let b = i8x16::from([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); let expected = i8x16::from([0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_i8x16() { let a = i8x16::from([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]); let b = i8x16::from([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); let expected = i8x16::from([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_i8x16_cmp_eq() { let a = i8x16::from([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]); let b = i8x16::from([2_i8; 16]); let expected = i8x16::from([0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_i8x16_cmp_gt() { let a = i8x16::from([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]); let b = i8x16::from([2_i8; 16]); let expected = i8x16::from([0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); } #[test] fn impl_i8x16_cmp_lt() { let a = i8x16::from([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]); let b = i8x16::from([2_i8; 16]); let expected = i8x16::from([-1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); let expected = i8x16::from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); let actual = a.cmp_lt(a); assert_eq!(expected, actual); } #[test] fn impl_i8x16_blend() { let use_t: i8 = -1; let t = i8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, 127]); let f = i8x16::from([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1]); let mask = i8x16::from([ use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, ]); let expected = i8x16::from([1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 126, 1]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_i8x16_abs() { let a = i8x16::from([ -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN, ]); let expected = i8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, i8::MIN]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_i8x16_unsigned_abs() { let a = i8x16::from([ -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN, ]); let expected = u8x16::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, i8::MIN as u8, ]); let 
actual = a.unsigned_abs(); assert_eq!(expected, actual); } #[test] fn impl_i8x16_max() { let a = i8x16::from([10, 2, -3, 4, 5, -6, 7, 8, 9, 7, -11, 12, 13, 6, 55, i8::MIN]); let b = i8x16::from([ -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN + 1, ]); let expected = i8x16::from([10, 2, -3, 4, 5, -6, 7, 8, 9, 7, -11, 12, 13, 6, 55, -127]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i8x16, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_i8x16_min() { let a = i8x16::from([10, 2, -3, 4, 5, -6, 7, 8, 9, 7, -11, 12, 13, 6, 55, i8::MIN]); let b = i8x16::from([ -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN + 1, ]); let expected = i8x16::from([ -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN, ]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i8x16, b| a.min(b), |a, b| a.min(b)); } #[test] fn impl_from_i16x16_truncate() { let src = i16x16::new([ 10000, 1001, 2, 3, 4, 5, 6, 32767, 10000, 1001, 2, 128, -129, -128, 127, 255, ]); let expected = i8x16::new([ 16, -23, 2, 3, 4, 5, 6, -1, 16, -23, 2, -128, 127, -128, 127, -1, ]); let result = i8x16::from_i16x16_truncate(src); assert_eq!(result, expected); } #[test] fn impl_from_i16x16_saturate() { let src = i16x16::new([ 10000, 1001, 2, 3, 4, 5, 6, 32767, 10000, 1001, 2, 128, -129, -128, 127, 255, ]); let expected = i8x16::new([ 127, 127, 2, 3, 4, 5, 6, 127, 127, 127, 2, 127, -128, -128, 127, 127, ]); let result = i8x16::from_i16x16_saturate(src); assert_eq!(result, expected); } #[test] fn test_i8x16_move_mask() { let a = i8x16::from([-1, 0, -2, -3, -1, 0, -2, -3, -1, 0, -1, 0, -1, 0, -1, 0]); let expected = 0b0101010111011101; let actual = a.move_mask(); assert_eq!(expected, actual); } #[test] fn test_i8x16_any() { let a = i8x16::from([0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert!(a.any()); let a = i8x16::from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0]); assert!(a.any()); // let a = i8x16::from([0; 16]); assert!(!a.any()); } #[test] fn test_i8x16_all() { let a = i8x16::from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0]); assert!(!a.all()); // let a = i8x16::from([-1; 16]); assert!(a.all()); } #[test] fn test_i8x16_none() { let a = i8x16::from([0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); assert!(!a.none()); let a = i8x16::from([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0]); assert!(!a.none()); // let a = i8x16::from([0; 16]); assert!(a.none()); } #[test] fn impl_from_i8_slice() { let src = [0, 1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; let result = i8x16::from_slice_unaligned(&src[1..17]); let expected = i8x16::new([1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); assert_eq!(result, expected); } #[test] fn test_i8x16_swizzle() { let a = i8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let b = i8x16::from([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]); let expected = i8x16::from([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]); let actual = a.swizzle(b); assert_eq!(expected, actual); let b = i8x16::from([15, 17, -13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, -1, 0]); let expected = i8x16::from([16, 0, 0, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 0, 1]); let actual = a.swizzle(b); assert_eq!(expected, actual); } #[test] fn test_i8x16_swizzle_relaxed() { let a = i8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); let b = i8x16::from([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]); let 
expected = i8x16::from([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]); let actual = a.swizzle_relaxed(b); assert_eq!(expected, actual); let b = i8x16::from([15, -17, -13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, -1, 0]); let expected = i8x16::from([16, 0, 0, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 0, 1]); let actual = a.swizzle_relaxed(b); assert_eq!(expected, actual); } #[cfg(feature = "serde")] #[test] fn impl_i8x16_ser_de_roundtrip() { let serialized = bincode::serialize(&i8x16::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(i8x16::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_i8x32.rs000066400000000000000000000263061473735473700174700ustar00rootroot00000000000000use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } #[test] fn impl_add_for_i8x32() { let a = i8x32::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, 127, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, 127, ]); let b = i8x32::from([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1, ]); let expected = i8x32::from([ 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 127, -128, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 127, -128, ]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_i8x32() { let a = i8x32::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, i8::MIN + 1, i8::MIN, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, i8::MIN + 1, i8::MIN, ]); let b = i8x32::from([ 17, 27, -1, 20, 21, -8, 23, 0, 1, 2, -9, 28, 64, 30, 1, 1, 17, 27, -1, 20, 21, -8, 23, 0, 1, 2, -9, 28, 64, 30, 1, 1, ]); let expected = i8x32::from([ -16, -25, 4, -16, -16, 14, -16, 8, 8, 8, 20, -16, -51, -16, -128, 127, -16, -25, 4, -16, -16, 14, -16, 8, 8, 8, 20, -16, -51, -16, -128, 127, ]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_saturating_add_for_i8x32() { let a = i8x32::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, 127, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, 127, ]); let b = i8x32::from([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1, ]); let expected = i8x32::from([ 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 127, 127, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 127, 127, ]); let actual = a.saturating_add(b); assert_eq!(expected, actual); } #[test] fn impl_saturating_sub_for_i8x32() { let a = i8x32::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, i8::MIN + 1, i8::MIN, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, i8::MIN + 1, i8::MIN, ]); let b = i8x32::from([ 17, 27, -1, 20, 21, -8, 23, 0, 1, 2, -9, 28, 64, 30, 1, 1, 17, 27, -1, 20, 21, -8, 23, 0, 1, 2, -9, 28, 64, 30, 1, 1, ]); let expected = i8x32::from([ -16, -25, 4, -16, -16, 14, -16, 8, 8, 8, 20, -16, -51, -16, -128, -128, -16, -25, 4, -16, -16, 14, -16, 8, 8, 8, 20, -16, -51, -16, -128, -128, ]); let actual = a.saturating_sub(b); assert_eq!(expected, actual); } #[test] fn impl_bitand_for_i8x32() { let a = i8x32::from([ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, ]); let b = i8x32::from([ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, ]); let expected = i8x32::from([ 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 
0, 1, 1, 0, 0, 1, 1, ]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_i8x32() { let a = i8x32::from([ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, ]); let b = i8x32::from([ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, ]); let expected = i8x32::from([ 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, ]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_i8x32() { let a = i8x32::from([ 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, ]); let b = i8x32::from([ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, ]); let expected = i8x32::from([ 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_i8x32_cmp_eq() { let a = i8x32::from([ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, ]); let b = i8x32::from([2_i8; 32]); let expected = i8x32::from([ 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, ]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_i8x32_cmp_gt() { let a = i8x32::from([ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, ]); let b = i8x32::from([2_i8; 32]); let expected = i8x32::from([ 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, ]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); } #[test] fn impl_i8x32_cmp_lt() { let a = i8x32::from([ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, ]); let b = i8x32::from([2_i8; 32]); let expected = i8x32::from([ -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, ]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); let expected = i8x32::from([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); let actual = a.cmp_lt(a); assert_eq!(expected, actual); } #[test] fn impl_i8x32_blend() { let use_t: i8 = -1; let t = i8x32::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, 127, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, 127, ]); let f = i8x32::from([ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1, ]); let mask = i8x32::from([ use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, ]); let expected = i8x32::from([ 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 126, 1, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 126, 1, ]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_i8x32_abs() { let a = i8x32::from([ -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN, -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN, ]); let expected = i8x32::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, i8::MIN, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, i8::MIN, ]); let actual = a.abs(); assert_eq!(expected, actual); } #[test] fn impl_i8x32_max() { let a = i8x32::from([ 10, 2, -3, 4, 
5, -6, 7, 8, 9, 7, -11, 12, 13, 6, 55, i8::MIN, 10, 2, -3, 4, 5, -6, 7, 8, 9, 7, -11, 12, 13, 6, 55, i8::MIN, ]); let b = i8x32::from([ -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN + 1, -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN + 1, ]); let expected = i8x32::from([ 10, 2, -3, 4, 5, -6, 7, 8, 9, 7, -11, 12, 13, 6, 55, -127, 10, 2, -3, 4, 5, -6, 7, 8, 9, 7, -11, 12, 13, 6, 55, -127, ]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i8x32, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_i8x32_min() { let a = i8x32::from([ 10, 2, -3, 4, 5, -6, 7, 8, 9, 7, -11, 12, 13, 6, 55, i8::MIN, 10, 2, -3, 4, 5, -6, 7, 8, 9, 7, -11, 12, 13, 6, 55, i8::MIN, ]); let b = i8x32::from([ -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN + 1, -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN + 1, ]); let expected = i8x32::from([ -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN, -1, 2, -3, 4, 5, -6, 7, 8, 9, -10, -11, 12, 13, -14, -126, i8::MIN, ]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: i8x32, b| a.min(b), |a, b| a.min(b)); } #[test] fn test_i8x32_move_mask() { let a = i8x32::from([ -1, 0, -2, -3, -1, 0, -2, -3, -1, 0, -1, 0, -1, 0, -1, 0, -1, -1, -1, -1, -1, -1, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, ]); let expected = 0b01010101011111110101010111011101; let actual = a.move_mask(); assert_eq!(expected, actual); } #[test] fn test_i8x32_any() { let a = i8x32::from([ 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert!(a.any()); let a = i8x32::from([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, ]); assert!(a.any()); // let a = i8x32::from([0; 32]); assert!(!a.any()); } #[test] fn test_i8x32_all() { let a = i8x32::from([ 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert!(!a.all()); // let a = i8x32::from([-1; 32]); assert!(a.all()); } #[test] fn test_i8x32_none() { let a = i8x32::from([ 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]); assert!(!a.none()); let a = i8x32::from([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, ]); assert!(!a.none()); // let a = i8x32::from([0; 32]); assert!(a.none()); } #[test] fn test_i8x32_swizzle_half() { let a = i8x32::from([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, ]); let b = i8x32::from([ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, ]); let expected = i8x32::from([ 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, ]); let actual = a.swizzle_half(b); assert_eq!(expected, actual); } #[cfg(feature = "serde")] #[test] fn impl_i8x32_ser_de_roundtrip() { let serialized = bincode::serialize(&i8x32::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(i8x32::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_u16x16.rs000066400000000000000000000205211473735473700175560ustar00rootroot00000000000000use std::num::Wrapping; use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } 
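// Illustrative sketch, not part of the upstream test suite: the wrapping
// behaviour that `impl_add_for_u16x16` below spot-checks can also be verified
// lane by lane against scalar `wrapping_add`, using only the `From` and `Add`
// impls already exercised in this file.
#[test]
fn sketch_u16x16_add_wraps_like_scalar() {
  let a = [u16::MAX, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
  let b = [1u16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
  let mut expected = [0u16; 16];
  for i in 0..16 {
    // each lane should wrap exactly like the scalar operation
    expected[i] = a[i].wrapping_add(b[i]);
  }
  assert_eq!(u16x16::from(a) + u16x16::from(b), u16x16::from(expected));
}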
#[test] fn impl_add_for_u16x16() { let a = u16x16::from([ 1, 2, i16::MAX as u16 - 1, i16::MAX as u16 - 1, 15, 20, 5000, 2990, 1, 2, i16::MAX as u16 - 1, i16::MAX as u16 - 1, 15, 20, 5000, 2990, ]); let b = u16x16::from([ 17, 18, 1, 2, 20, 5, 900, 900, 17, 18, 1, 2, 20, 5, 900, 900, ]); let expected = u16x16::from([ 18, 20, i16::MAX as u16, i16::MIN as u16, 35, 25, 5900, 3890, 18, 20, i16::MAX as u16, i16::MIN as u16, 35, 25, 5900, 3890, ]); let actual = a + b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x16, b| a + b, |a, b| a.wrapping_add(b), ); } #[test] fn impl_sub_for_u16x16() { let a = u16x16::from([ 1, 2, 1, 2, 15, 20, 5000, 2990, 1, 2, u16::MAX, u16::MAX - 1, 15, 20, 5000, 2990, ]); let b = u16x16::from([ 17, 18, 1, 1, 20, 5, 900, 900, 17, 18, 1, 1, 20, 5, 900, 900, ]); let expected = u16x16::from([ 65520, 65520, 0, 1, 65531, 15, 4100, 2090, 65520, 65520, 65534, 65533, 65531, 15, 4100, 2090, ]); let actual = a - b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x16, b| a - b, |a, b| a.wrapping_sub(b), ); } #[test] fn impl_saturating_add_for_u16x16() { let a = u16x16::from([ 1, 2, u16::MAX - 1, u16::MAX - 1, 15, 20, 5000, 2990, 1, 2, u16::MAX - 1, u16::MAX - 1, 15, 20, 5000, 2990, ]); let b = u16x16::from([ 17, 18, 1, 2, 20, 5, 900, 900, 17, 18, 1, 2, 20, 5, 900, 900, ]); let expected = u16x16::from([ 18, 20, u16::MAX, u16::MAX, 35, 25, 5900, 3890, 18, 20, u16::MAX, u16::MAX, 35, 25, 5900, 3890, ]); let actual = a.saturating_add(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x16, b| a.saturating_add(b), |a, b| a.saturating_add(b), ); } #[test] fn impl_saturating_sub_for_u16x16() { let a = u16x16::from([ 1, 2, 1, 0, 15, 20, 5000, 2990, 1, 2, 1, 0, 15, 20, 5000, 2990, ]); let b = u16x16::from([ 17, 18, 1, 1, 20, 5, 900, 900, 17, 18, 1, 1, 20, 5, 900, 900, ]); let expected = u16x16::from([ 0, 0, 0, 0, 0, 15, 4100, 2090, 0, 0, 0, 0, 0, 15, 4100, 2090, ]); let actual = a.saturating_sub(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x16, b| a.saturating_sub(b), |a, b| a.saturating_sub(b), ); } #[test] fn impl_bitand_for_u16x16() { let a = u16x16::from([0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1]); let b = u16x16::from([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]); let expected = u16x16::from([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x16, b| a & b, |a, b| a & b); } #[test] fn impl_bitor_for_u16x16() { let a = u16x16::from([0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1]); let b = u16x16::from([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]); let expected = u16x16::from([0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x16, b| a | b, |a, b| a | b); } #[test] fn impl_bitxor_for_u16x16() { let a = u16x16::from([0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1]); let b = u16x16::from([0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]); let expected = u16x16::from([0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x16, b| a ^ b, |a, b| a ^ b); } #[test] fn impl_shl_for_u16x16() { let a = u16x16::from([ 1, 2, u16::MAX - 1, u16::MAX - 1, 128, 255, 590, 5667, 1, 2, u16::MAX - 1, u16::MAX - 1, 128, 255, 590, 5667, ]); let b = 2; let expected = u16x16::from([ 1 << 2, 2 << 2, (u16::MAX - 1) << 2, 
(u16::MAX - 1) << 2, 128 << 2, 255 << 2, 590 << 2, 5667 << 2, 1 << 2, 2 << 2, (u16::MAX - 1) << 2, (u16::MAX - 1) << 2, 128 << 2, 255 << 2, 590 << 2, 5667 << 2, ]); let actual = a << b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x16, _b| a << 3, |a, _b| a << 3); } #[test] fn impl_shr_for_u16x16() { let a = u16x16::from([ 1, 2, u16::MAX - 1, u16::MAX - 1, 128, 255, 590, 5667, 1, 2, u16::MAX - 1, u16::MAX - 1, 128, 255, 590, 5667, ]); let b = 2; let expected = u16x16::from([ 1 >> 2, 2 >> 2, (u16::MAX - 1) >> 2, (u16::MAX - 1) >> 2, 128 >> 2, 255 >> 2, 590 >> 2, 5667 >> 2, 1 >> 2, 2 >> 2, (u16::MAX - 1) >> 2, (u16::MAX - 1) >> 2, 128 >> 2, 255 >> 2, 590 >> 2, 5667 >> 2, ]); let actual = a >> b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x16, _b| a >> 3, |a, _b| a >> 3); } #[test] fn impl_u16x16_cmp_eq() { let a = u16x16::from([1, 2, 3, 4, 2, 1, 8, 2, 1, 2, 3, 4, 2, 1, 8, 2]); let b = u16x16::from([2_u16; 16]); let expected = u16x16::from([ 0, u16::MAX, 0, 0, u16::MAX, 0, 0, u16::MAX, 0, u16::MAX, 0, 0, u16::MAX, 0, 0, u16::MAX, ]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_u16x16_blend() { let use_t: u16 = u16::MAX; let t = u16x16::from([1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]); let f = u16x16::from([ 17, 18, 19, 20, 25, 30, 50, 90, 17, 18, 19, 20, 25, 30, 50, 90, ]); let mask = u16x16::from([ use_t, 0, use_t, 0, 0, 0, 0, use_t, use_t, 0, use_t, 0, 0, 0, 0, use_t, ]); let expected = u16x16::from([1, 18, 3, 20, 25, 30, 50, 8, 1, 18, 3, 20, 25, 30, 50, 8]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_u16x16_max() { let a = u16x16::from([u16::MAX, 2, 1, 0, 6, 8, 12, 9, 1, 2, 1, 0, 6, 8, 12, 9]); let b = u16x16::from([17, 0, 1, 1, 19, 0, 0, 0, 17, 0, 1, 1, 19, 0, 0, 0]); let expected = u16x16::from([u16::MAX, 2, 1, 1, 19, 8, 12, 9, 17, 2, 1, 1, 19, 8, 12, 9]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x16, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_u16x16_from_u8x16() { let v = [10u8, 2, 3, 4, 5, 6, 7, 8, 9, 7, 127, 12, 13, 6, 55, 255]; assert_eq!( u16x16::from(v.map(|a| u16::from(a))), u16x16::from(u8x16::from(v)) ); crate::test_random_vector_vs_scalar( |a: u8x16, _b| u16x16::from(a), |a, _b| u16::from(a), ); } #[test] fn impl_u16x16_min() { let a = u16x16::from([1, 2, 1, 0, 6, 8, 12, 9, 1, 2, 1, 0, 6, 8, 12, 9]); let b = u16x16::from([u16::MAX, 0, 1, 1, 19, 0, 0, 0, 17, 0, 1, 1, 19, 0, 0, 0]); let expected = u16x16::from([1, 0, 1, 0, 6, 0, 0, 0, 1, 0, 1, 0, 6, 0, 0, 0]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x16, b| a.min(b), |a, b| a.min(b)); } #[test] fn impl_mul_for_u16x16() { let a = u16x16::from([ 2, 2, i16::MAX as u16, 4, 5, 6, u16::MIN + 1, u16::MIN, 1, 2, i16::MAX as u16, 4, 5, 6, u16::MIN + 1, u16::MIN, ]); let b = u16x16::from([ 17, 18, 190, 20, 21, 22, 1, 1, 17, 18, 190, 20, 21, 22, 1, 1, ]); let expected = u16x16::from([ 2 * 17, 36, (Wrapping(i16::MAX as u16) * Wrapping(190)).0, 80, 105, 132, u16::MIN + 1, u16::MIN, 17, 36, (Wrapping(i16::MAX as u16) * Wrapping(190)).0, 80, 105, 132, u16::MIN + 1, u16::MIN, ]); let actual = a * b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x16, b| a * b, |a, b| a.wrapping_mul(b), ); } #[cfg(feature = "serde")] #[test] fn impl_u16x16_ser_de_roundtrip() { let serialized = bincode::serialize(&u16x16::ZERO).expect("serialization failed"); let deserialized = 
bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(u16x16::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_u16x8.rs000066400000000000000000000165151473735473700175070ustar00rootroot00000000000000use std::num::Wrapping; use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] fn impl_add_for_u16x8() { let a = u16x8::from([1, 2, 3, 4, 5, 6, u16::MAX - 1, u16::MAX - 1]); let b = u16x8::from([17, 18, 19, 20, 21, 22, 1, 2]); let expected = u16x8::from([18, 20, 22, 24, 26, 28, u16::MAX, 0]); let actual = a + b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x8, b| a + b, |a, b| a.wrapping_add(b), ); } #[test] fn impl_sub_for_u16x8() { let a = u16x8::from([1468, 220, 3, 4456, 5, 6897, 1, 0]); let b = u16x8::from([17, 180, 192, 200, 121, 22, 1, 1]); let expected = u16x8::from([1451, 40, 65347, 4256, 65420, 6875, 0, u16::MAX]); let actual = a - b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x8, b| a - b, |a, b| a.wrapping_sub(b), ); } #[test] fn impl_saturating_add_for_u16x8() { let a = u16x8::from([1, 2, 3, 4, 5, 6, u16::MAX - 1, u16::MAX - 1]); let b = u16x8::from([17, 18, 19, 20, 21, 22, 1, 2]); let expected = u16x8::from([18, 20, 22, 24, 26, 28, u16::MAX, u16::MAX]); let actual = a.saturating_add(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x8, b| a.saturating_add(b), |a, b| a.saturating_add(b), ); } #[test] fn impl_saturating_sub_for_u16x8() { let a = u16x8::from([1468, 220, 3, 4456, 5, 6897, 1, 0]); let b = u16x8::from([17, 180, 192, 200, 121, 22, 1, 1]); let expected = u16x8::from([1451, 40, 0, 4256, 0, 6875, 0, 0]); let actual = a.saturating_sub(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x8, b| a.saturating_sub(b), |a, b| a.saturating_sub(b), ); } #[test] fn impl_mul_for_u16x8() { let a = u16x8::from([1, 2, u16::MAX, 4, 5, 6, u16::MIN + 1, u16::MIN]); let b = u16x8::from([17, 18, 190, 20, 21, 22, 1, 1]); let expected = u16x8::from([ 17, 36, (Wrapping(u16::MAX) * Wrapping(190)).0, 80, 105, 132, u16::MIN + 1, u16::MIN, ]); let actual = a * b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x8, b| a * b, |a, b| a.wrapping_mul(b), ); } #[test] fn impl_bitand_for_u8x16() { let a = u8x16::from([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]); let b = u8x16::from([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); let expected = u8x16::from([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x8, b| a & b, |a, b| a & b); } #[test] fn impl_bitor_for_u8x16() { let a = u8x16::from([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]); let b = u8x16::from([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); let expected = u8x16::from([0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x8, b| a | b, |a, b| a | b); } #[test] fn impl_bitxor_for_u8x16() { let a = u8x16::from([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]); let b = u8x16::from([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); let expected = u8x16::from([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x8, b| a ^ b, |a, b| a ^ b); } #[test] fn impl_shl_for_u16x8() { let a = u16x8::from([1, 2, 3, 4, 5, 6, u16::MAX - 1, 
u16::MAX - 1]); let b = 2; let expected = u16x8::from([ 1 << 2, 2 << 2, 3 << 2, 4 << 2, 5 << 2, 6 << 2, (u16::MAX - 1) << 2, (u16::MAX - 1) << 2, ]); let actual = a << b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x8, _b| a << 3, |a, _b| a << 3); } #[test] fn impl_shr_for_u16x8() { let a = u16x8::from([1, 2, 3, 4, 5, 6, u16::MAX - 1, u16::MAX - 1]); let b = 2; let expected = u16x8::from([ 1 >> 2, 2 >> 2, 3 >> 2, 4 >> 2, 5 >> 2, 6 >> 2, (u16::MAX - 1) >> 2, (u16::MAX - 1) >> 2, ]); let actual = a >> b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x8, _b| a >> 3, |a, _b| a >> 3); } #[test] fn impl_u16x8_cmp_eq() { let a = u16x8::from([1, 2, 3, 4, 1, 2, 3, 4]); let b = u16x8::from([2_u16; 8]); let expected = u16x8::from([0, u16::MAX, 0, 0, 0, u16::MAX, 0, 0]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_u16x8_blend() { let use_t: u16 = u16::MAX; let t = u16x8::from([1, 2, 3, 4, 5, 6, 7, 8]); let f = u16x8::from([17, 18, 19, 20, 21, 22, 23, 24]); let mask = u16x8::from([use_t, 0, use_t, 0, use_t, 0, use_t, 0]); let expected = u16x8::from([1, 18, 3, 20, 5, 22, 7, 24]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_u16x8_max() { let a = u16x8::from([1, 37001, 3, 4, 5, 6, 7, 8]); let b = u16x8::from([37000, 37000, 19, 20, 2, 2, 2, 24]); let expected = u16x8::from([37000, 37001, 19, 20, 5, 6, 7, 24]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x8, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_u16x8_min() { let a = u16x8::from([1, 37001, 3, 4, 5, 6, 7, 8]); let b = u16x8::from([37000, 37000, 19, 20, 2, 2, 2, 24]); let expected = u16x8::from([1, 37000, 3, 4, 2, 2, 2, 8]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u16x8, b| a.min(b), |a, b| a.min(b)); } #[test] fn impl_u16x8_from_u8x16_low() { let a = u8x16::from([255, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 128]); let expected = u16x8::from([255, 2, 3, 4, 5, 6, 7, 8]); let actual = u16x8::from_u8x16_low(a); assert_eq!(expected, actual); } #[test] fn impl_u16x8_from_u8x16_high() { let a = u8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 128]); let expected = u16x8::from([9, 10, 11, 12, 13, 14, 255, 128]); let actual = u16x8::from_u8x16_high(a); assert_eq!(expected, actual); } #[test] fn impl_u16x8_mul_keep_high() { let a = u16x8::from([u16::MAX, 200, 300, 4568, 1, 2, 3, 200]); let b = u16x8::from([u16::MAX, 600, 700, 8910, 15, 26, 37, 600]); let c: [u16; 8] = u16x8::mul_keep_high(a, b).into(); assert_eq!( c, [ (u32::from(u16::MAX) * u32::from(u16::MAX) >> 16) as u16, 1, 3, 621, 0, 0, 0, 1 ] ); crate::test_random_vector_vs_scalar( |a: u16x8, b| u16x8::mul_keep_high(a, b), |a, b| ((u32::from(a) * u32::from(b)) >> 16) as u16, ); } #[test] fn impl_u16x8_mul_widen() { let a = u16x8::from([1, 2, 3, 4, 5, 6, i16::MAX as u16, u16::MAX]); let b = u16x8::from([17, 18, 190, 20, 21, 22, i16::MAX as u16, u16::MAX]); let expected = u32x8::from([ 17, 36, 570, 80, 105, 132, (i16::MAX as u32) * (i16::MAX as u32), (u16::MAX as u32) * (u16::MAX as u32), ]); let actual = a.mul_widen(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u16x8, b| a.mul_widen(b), |a, b| u32::from(a) * u32::from(b), ); } #[cfg(feature = "serde")] #[test] fn impl_u16x8_ser_de_roundtrip() { let serialized = bincode::serialize(&u16x8::ZERO).expect("serialization failed"); let deserialized = 
bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(u16x8::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_u32x4.rs000066400000000000000000000163271473735473700175020ustar00rootroot00000000000000use std::num::Wrapping; use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] fn impl_add_for_u32x4() { let a = u32x4::from([1, 2, u32::MAX - 1, u32::MAX - 1]); let b = u32x4::from([17, 18, 1, 2]); let expected = u32x4::from([18, 20, u32::MAX, u32::MIN]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_u32x4() { let a = u32x4::from([9001, 2, 1, 0]); let b = u32x4::from([17, 18, 1, 1]); let expected = u32x4::from([8984, 4294967280, 0, u32::MAX]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_u32x4() { let a = u32x4::from([1, 2, u32::MIN + 1, u32::MAX]); let b = u32x4::from([17, 18, 1, 32]); let expected = u32x4::from([17, 36, 1, (Wrapping(u32::MAX) * Wrapping(32)).0]); let actual = a * b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x4, b| a * b, |a, b| a.wrapping_mul(b), ); } #[test] fn impl_bitand_for_u32x4() { let a = u32x4::from([0, 0, 1, 1]); let b = u32x4::from([0, 1, 0, 1]); let expected = u32x4::from([0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x4, b| a & b, |a, b| a & b); } #[test] fn impl_bitor_for_u32x4() { let a = u32x4::from([0, 0, 1, 1]); let b = u32x4::from([0, 1, 0, 1]); let expected = u32x4::from([0, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x4, b| a | b, |a, b| a | b); } #[test] fn impl_bitxor_for_u32x4() { let a = u32x4::from([0, 0, 1, 1]); let b = u32x4::from([0, 1, 0, 1]); let expected = u32x4::from([0, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x4, b| a ^ b, |a, b| a ^ b); } #[test] fn impl_shl_for_u32x4() { let a = u32x4::from([1, 2, u32::MAX - 1, u32::MAX - 1]); let b = 2; let expected = u32x4::from([1 << 2, 2 << 2, (u32::MAX - 1) << 2, (u32::MAX - 1) << 2]); let actual = a << b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x4, _b| a << 3, |a, _b| a << 3); } #[test] fn impl_shr_for_u32x4() { let a = u32x4::from([1, 2, u32::MAX - 1, u32::MAX - 1]); let b = 2; let expected = u32x4::from([1 >> 2, 2 >> 2, (u32::MAX - 1) >> 2, (u32::MAX - 1) >> 2]); let actual = a >> b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x4, _b| a >> 3, |a, _b| a >> 3); } #[test] fn impl_u32x4_cmp_eq() { let a = u32x4::from([1, 2, 3, 4]); let b = u32x4::from([2_u32; 4]); let expected = u32x4::from([0, u32::MAX, 0, 0]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_u32x4_cmp_gt() { let a = u32x4::from([1, 2, 3, u32::MAX]); let b = u32x4::from([u32::MAX, 2, 2, 2]); let expected = u32x4::from([0, 0, u32::MAX, u32::MAX]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x4, b| a.cmp_gt(b), |a, b| if a > b { u32::MAX } else { 0 }, ); } #[test] fn impl_u32x4_cmp_lt() { let a = u32x4::from([1, 2, 3, u32::MAX]); let b = u32x4::from([u32::MAX, 3, 3, 3]); let expected = u32x4::from([u32::MAX, u32::MAX, 0, 0]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); let expected = u32x4::from([0, 0, 0, 0]); let actual = a.cmp_lt(a); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x4, 
b| a.cmp_lt(b), |a, b| if a < b { u32::MAX } else { 0 }, ); } #[test] fn impl_u32x4_blend() { let use_t: u32 = u32::MAX; let t = u32x4::from([1, 2, 3, 4]); let f = u32x4::from([17, 18, 19, 20]); let mask = u32x4::from([use_t, 0, use_t, 0]); let expected = u32x4::from([1, 18, 3, 20]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_u32x4_max() { let a = u32x4::from([0, 2, 3, 4]); let b = u32x4::from([17, 1, 0, 20]); let expected = u32x4::from([17, 2, 3, 20]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x4, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_u32x4_min() { let a = u32x4::from([0, 2, 3, 4]); let b = u32x4::from([17, 1, 0, 20]); let expected = u32x4::from([0, 1, 0, 4]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x4, b| a.min(b), |a, b| a.min(b)); } #[test] fn impl_u32x4_not() { let a = u32x4::from([15313, 52322, u32::MAX, 4]); let expected = u32x4::from([4294951982, 4294914973, 0, 4294967291]); let actual = !a; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x4, _b| !a, |a, _b| !a); } #[test] fn impl_u32x4_shr_each() { let a = u32x4::from([15313, 52322, u32::MAX, 4]); let shift = u32x4::from([1, 30, 8, 33 /* test masking behavior */]); let expected = u32x4::from([7656u32, 0, 16777215, 2]); let actual = a >> shift; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x4, b| a >> b, |a, b| a.wrapping_shr(b), ); } #[test] fn impl_u32x4_shl_each() { let a = u32x4::from([15313, 52322, u32::MAX, 4]); let shift = u32x4::from([1, 30, 8, 33 /* test masking behavior */]); let expected = u32x4::from([30626, 2147483648, 4294967040, 8]); let actual = a << shift; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x4, b| a << b, |a, b| a.wrapping_shl(b), ); } #[test] fn test_u32x4_any() { let a = u32x4::from([0, 0, 0, u32::MAX]); assert!(a.any()); // let a = u32x4::from([0, 0, 0, 0]); assert!(!a.any()); } #[test] fn test_u32x4_all() { let a = u32x4::from([0, 0, 0, u32::MAX]); assert!(!a.all()); // let a = u32x4::from([u32::MAX; 4]); assert!(a.all()); } #[test] fn test_u32x4_none() { let a = u32x4::from([0, 0, 0, u32::MAX]); assert!(!a.none()); // let a = u32x4::from([0; 4]); assert!(a.none()); } #[test] fn impl_u32x4_mul_widen() { let a = u32x4::from([1, 2, 3 * 1000000, u32::MAX]); let b = u32x4::from([5, 6, 7 * 1000000, u32::MAX]); let expected = u64x4::from([ 1 * 5, 2 * 6, 3 * 7 * 1000000 * 1000000, u32::MAX as u64 * u32::MAX as u64, ]); let actual = a.mul_widen(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x4, b| a.mul_widen(b), |a, b| u64::from(a) * u64::from(b), ); } #[test] fn impl_u32x4_mul_keep_high() { let mul_high = |a: u32, b: u32| ((u64::from(a) * u64::from(b)) >> 32) as u32; let a = u32x4::from([1, 2 * 10000000, 3 * 1000000, u32::MAX]); let b = u32x4::from([5, 6 * 100, 7 * 1000000, u32::MAX]); let expected = u32x4::from([ mul_high(1, 5), mul_high(2 * 10000000, 6 * 100), mul_high(3 * 1000000, 7 * 1000000), mul_high(u32::MAX, u32::MAX), ]); let actual = a.mul_keep_high(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x4, b| a.mul_keep_high(b), |a, b| ((u64::from(a) * u64::from(b)) >> 32) as u32, ); } #[cfg(feature = "serde")] #[test] fn impl_u32x4_ser_de_roundtrip() { let serialized = bincode::serialize(&u32x4::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion 
failed"); assert_eq!(u32x4::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_u32x8.rs000066400000000000000000000175441473735473700175100ustar00rootroot00000000000000use std::num::Wrapping; use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } #[test] fn impl_add_for_u32x8() { let a = u32x8::from([1, 2, u32::MAX - 1, u32::MAX - 1, 31, 72, 13, 53]); let b = u32x8::from([17, 18, 1, 2, 12, 12, 634, 15]); let expected = u32x8::from([18, 20, u32::MAX, u32::MIN, 43, 84, 647, 68]); let actual = a + b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x8, b| a + b, |a, b| a.wrapping_add(b), ); } #[test] fn impl_sub_for_u32x8() { let a = u32x8::from([9001, 2, 1, 0, 12, 1, 9, 10]); let b = u32x8::from([17, 18, 1, 1, 15, 1, 2, 5]); let expected = u32x8::from([8984, 4294967280, 0, u32::MAX, 4294967293, 0, 7, 5]); let actual = a - b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x8, b| a - b, |a, b| a.wrapping_sub(b), ); } #[test] fn impl_mul_for_u32x8() { let a = u32x8::from([1, 2, u32::MIN + 1, u32::MAX, 123, u32::MIN, 9, 3802]); let b = u32x8::from([17, 18, 1, 32, 456, 4, 190, 100]); let expected = u32x8::from([ 17, 36, 1, (Wrapping(u32::MAX) * Wrapping(32)).0, 123 * 456, 0, 190 * 9, 380200, ]); let actual = a * b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x8, b| a * b, |a, b| a.wrapping_mul(b), ); } #[test] fn impl_bitand_for_u32x8() { let a = u32x8::from([0, 0, 1, 1, 1, 0, 0, 1]); let b = u32x8::from([0, 1, 0, 1, 0, 1, 1, 1]); let expected = u32x8::from([0, 0, 0, 1, 0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x8, b| a | b, |a, b| a | b); } #[test] fn impl_bitor_for_u32x8() { let a = u32x8::from([0, 0, 1, 1, 1, 0, 0, 1]); let b = u32x8::from([0, 1, 0, 1, 0, 1, 1, 1]); let expected = u32x8::from([0, 1, 1, 1, 1, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x8, b| a & b, |a, b| a & b); } #[test] fn impl_bitxor_for_u32x8() { let a = u32x8::from([0, 0, 1, 1, 1, 0, 0, 1]); let b = u32x8::from([0, 1, 0, 1, 0, 1, 1, 1]); let expected = u32x8::from([0, 1, 1, 0, 1, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x8, b| a ^ b, |a, b| a ^ b); } #[test] fn impl_shl_for_u32x8() { let a = u32x8::from([1, 2, u32::MAX - 1, i32::MAX as u32 - 1, 128, 255, 590, 5667]); let b = 2; let expected = u32x8::from([ 1 << 2, 2 << 2, (u32::MAX - 1) << 2, (i32::MAX as u32 - 1) << 2, 128 << 2, 255 << 2, 590 << 2, 5667 << 2, ]); let actual = a << b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x8, _b| a << 3, |a, _b| a << 3); } #[test] fn impl_shr_for_u32x8() { let a = u32x8::from([1, 2, u32::MAX - 1, i32::MAX as u32 - 1, 128, 255, 590, 5667]); let b = 2; let expected = u32x8::from([ 1 >> 2, 2 >> 2, (u32::MAX - 1) >> 2, (i32::MAX as u32 - 1) >> 2, 128 >> 2, 255 >> 2, 590 >> 2, 5667 >> 2, ]); let actual = a >> b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x8, _b| a >> 3, |a, _b| a >> 3); } #[test] fn impl_u32x8_cmp_eq() { let a = u32x8::from([1, 2, 3, 4, 2, 1, 8, 2]); let b = u32x8::from([2_u32; 8]); let expected = u32x8::from([0, u32::MAX, 0, 0, u32::MAX, 0, 0, u32::MAX]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_u32x8_cmp_gt() { let a = u32x8::from([1, 2, u32::MAX, 4, 1, 2, 8, 10]); let b = u32x8::from([5, 5, 5, 
5, 5, 5, 5, 5]); let expected = u32x8::from([0, 0, u32::MAX, 0, 0, 0, u32::MAX, u32::MAX]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x8, b| a.cmp_gt(b), |a, b| if a > b { u32::MAX } else { 0 }, ); } #[test] fn impl_u32x8_cmp_lt() { let a = u32x8::from([5, 5, 5, 5, 5, 5, 5, 5]); let b = u32x8::from([1, 2, u32::MAX, 4, 1, 2, 8, 10]); let expected = u32x8::from([0, 0, u32::MAX, 0, 0, 0, u32::MAX, u32::MAX]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x8, b| a.cmp_lt(b), |a, b| if a < b { u32::MAX } else { 0 }, ); } #[test] fn impl_u32x8_blend() { let use_t: u32 = u32::MAX; let t = u32x8::from([1, 2, 3, 4, 5, 6, 7, 8]); let f = u32x8::from([17, 18, 19, 20, 25, 30, 50, 90]); let mask = u32x8::from([use_t, 0, use_t, 0, 0, 0, 0, use_t]); let expected = u32x8::from([1, 18, 3, 20, 25, 30, 50, 8]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_u32x8_max() { let a = u32x8::from([1, 2, 1, 0, 6, 0, 12, u32::MAX]); let b = u32x8::from([17, 0, 1, 1, 19, 0, 0, 1000]); let expected = u32x8::from([17, 2, 1, 1, 19, 0, 12, u32::MAX]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x8, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_u32x8_min() { let a = u32x8::from([1, 2, 1, 0, 6, 0, 12, u32::MAX]); let b = u32x8::from([17, 0, 1, 1, 19, 0, 0, 1000]); let expected = u32x8::from([1, 0, 1, 0, 6, 0, 0, 1000]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x8, b| a.min(b), |a, b| a.min(b)); } #[test] fn impl_u32x4_shr_each() { let a = u32x8::from([15313, 52322, u32::MAX, 4, 10, 20, 30, 40]); let shift = u32x8::from([1, 30, 8, 33 /* test masking behavior */, 1, 2, 3, 4]); let expected = u32x8::from([7656, 0, 16777215, 2, 5, 5, 3, 2]); let actual = a >> shift; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x8, b| a >> b, |a, b| a.wrapping_shr(b), ); } #[test] fn impl_u32x8_shl_each() { let a = u32x8::from([15313, 52322, u32::MAX, 4, 1, 2, 3, 4]); let shift = u32x8::from([1, 30, 8, 33 /* test masking behavior */, 1, 2, 3, 4]); let expected = u32x8::from([30626, 2147483648, 4294967040, 8, 2, 8, 24, 64]); let actual = a << shift; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u32x8, b| a << b, |a, b| a.wrapping_shl(b), ); } #[test] fn impl_u32x8_not() { let a = u32x8::from([15313, 52322, u32::MAX, 4, 1, 2, 3, 4]); let expected = u32x8::from([ 4294951982, 4294914973, 0, 4294967291, 4294967294, 4294967293, 4294967292, 4294967291, ]); let actual = !a; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u32x8, _b| !a, |a, _b| !a); } #[test] fn impl_u32x8_from_u16x8() { let a = u16x8::from([1, 2, 3, 4, 5, i16::MAX as u16, u16::MAX - 1, u16::MAX]); let actual = u32x8::from(a); let expected = u32x8::from([ 1, 2, 3, 4, 5, i16::MAX as u32, (u16::MAX - 1) as u32, u16::MAX as u32, ]); assert_eq!(actual, expected); crate::test_random_vector_vs_scalar( |a: u16x8, _b| u32x8::from(a), |a, _b| a as u32, ); } #[test] fn test_u32x8_any() { let a = u32x8::from([0, 0, 0, u32::MAX, 0, 0, 0, 0]); assert!(a.any()); // let a = u32x8::from([0, 0, 0, 0, 0, 0, 0, 0]); assert!(!a.any()); } #[test] fn test_u32x8_all() { let a = u32x8::from([0, 0, 0, u32::MAX, 0, 0, 0, 0]); assert!(!a.all()); // let a = u32x8::from([u32::MAX; 8]); assert!(a.all()); } #[test] fn test_u32x8_none() { let a = u32x8::from([0, 0, 0, u32::MAX, 0, 0, 0, 0]); 
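// `a` has one non-zero lane (u32::MAX), so `none()` must report false here;
// the all-zero vector assigned just below should make it report true.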
assert!(!a.none()); // let a = u32x8::from([0; 8]); assert!(a.none()); } #[test] fn impl_u32x8_mul_keep_high() { crate::test_random_vector_vs_scalar( |a: u32x8, b| u32x8::mul_keep_high(a, b), |a, b| ((u64::from(a) * u64::from(b)) >> 32) as u32, ); } #[cfg(feature = "serde")] #[test] fn impl_u32x8_ser_de_roundtrip() { let serialized = bincode::serialize(&u32x8::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(u32x8::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_u64x2.rs000066400000000000000000000065131473735473700175010ustar00rootroot00000000000000use std::num::Wrapping; use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] fn impl_add_for_u64x2() { let a = u64x2::from([u64::MAX - 1, u64::MAX - 1]); let b = u64x2::from([1, 2]); let expected = u64x2::from([u64::MAX, u64::MIN]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_u64x2() { let a = u64x2::from([1, 0]); let b = u64x2::from([1, 1]); let expected = u64x2::from([0, u64::MAX]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_u64x2() { let a = u64x2::from([u64::MIN + 1, u64::MAX]); let b = u64x2::from([2, 2]); let expected = u64x2::from([2, (Wrapping(u64::MAX) * Wrapping(2)).0]); let actual = a * b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u64x2, b| a * b, |a, b| a.wrapping_mul(b), ); } #[test] fn impl_bitand_for_u64x2() { let a = u64x2::from([1, 1]); let b = u64x2::from([0, 1]); let expected = u64x2::from([0, 1]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_u64x2() { let a = u64x2::from([1, 1]); let b = u64x2::from([0, 1]); let expected = u64x2::from([1, 1]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_u64x2() { let a = u64x2::from([1, 1]); let b = u64x2::from([0, 1]); let expected = u64x2::from([1, 0]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_shl_for_u64x2() { let a = u64x2::from([u64::MAX - 1, u64::MAX - 1]); let b = 2; let expected = u64x2::from([(u64::MAX - 1) << 2, (u64::MAX - 1) << 2]); let actual = a << b; assert_eq!(expected, actual); } #[test] fn impl_shr_for_u64x2() { let a = u64x2::from([u64::MAX - 1, u64::MAX - 1]); let b = 2; let expected = u64x2::from([(u64::MAX - 1) >> 2, (u64::MAX - 1) >> 2]); let actual = a >> b; assert_eq!(expected, actual); } #[test] fn impl_u64x2_blend() { let use_t: u64 = u64::MAX; let t = u64x2::from([1, 2]); let f = u64x2::from([17, 18]); let mask = u64x2::from([use_t, 0]); let expected = u64x2::from([1, 18]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_u64x2_cmp_eq() { let a = u64x2::from([1_u64, 4]); let b = u64x2::from([3_u64, 4]); let expected = u64x2::from([0, u64::MAX]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_u64x2_cmp_gt() { let a = u64x2::from([1_u64, 4]); let b = u64x2::from([3_u64, 4]); let expected = u64x2::from([0, 0]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u64x2, b| a.cmp_gt(b), |a, b| if a > b { u64::MAX } else { 0 }, ); } #[test] fn impl_u64x2_cmp_lt() { let a = u64x2::from([3_u64, 4]); let b = u64x2::from([1_u64, 4]); let expected = u64x2::from([0, 0]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u64x2, b| a.cmp_lt(b), |a, b| if a < b { u64::MAX } else { 0 
}, ); } #[cfg(feature = "serde")] #[test] fn impl_u64x2_ser_de_roundtrip() { let serialized = bincode::serialize(&u64x2::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(u64x2::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_u64x4.rs000066400000000000000000000070721473735473700175040ustar00rootroot00000000000000use std::num::Wrapping; use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } #[test] fn impl_add_for_u64x4() { let a = u64x4::from([u64::MAX - 1, u64::MAX - 1, 6, 9]); let b = u64x4::from([1, 2, 3, 4]); let expected = u64x4::from([u64::MAX, u64::MIN, 9, 13]); let actual = a + b; assert_eq!(expected, actual); } #[test] fn impl_sub_for_u64x4() { let a = u64x4::from([1, 0, 9, 12]); let b = u64x4::from([1, 1, 3, 3]); let expected = u64x4::from([0, u64::MAX, 6, 9]); let actual = a - b; assert_eq!(expected, actual); } #[test] fn impl_mul_for_u64x4() { let a = u64x4::from([u64::MIN + 1, u64::MAX, 30, 70]); let b = u64x4::from([2, 2, 10, 20]); let expected = u64x4::from([2, (Wrapping(u64::MAX) * Wrapping(2)).0, 300, 1400]); let actual = a * b; assert_eq!(expected, actual); } #[test] fn impl_bitand_for_u64x4() { let a = u64x4::from([1, 1, 0, 0]); let b = u64x4::from([0, 1, 0, 1]); let expected = u64x4::from([0, 1, 0, 0]); let actual = a & b; assert_eq!(expected, actual); } #[test] fn impl_bitor_for_u64x4() { let a = u64x4::from([1, 1, 0, 0]); let b = u64x4::from([0, 1, 0, 1]); let expected = u64x4::from([1, 1, 0, 1]); let actual = a | b; assert_eq!(expected, actual); } #[test] fn impl_bitxor_for_u64x4() { let a = u64x4::from([1, 1, 1, 0]); let b = u64x4::from([0, 1, 0, 1]); let expected = u64x4::from([1, 0, 1, 1]); let actual = a ^ b; assert_eq!(expected, actual); } #[test] fn impl_shl_for_u64x4() { let a = u64x4::from([u64::MAX - 1, u64::MAX - 1, 65535, 0]); let b = 2; let expected = u64x4::from([(u64::MAX - 1) << 2, (u64::MAX - 1) << 2, 65535 << 2, 0 << 2]); let actual = a << b; assert_eq!(expected, actual); } #[test] fn impl_shr_for_u64x4() { let a = u64x4::from([u64::MAX - 1, u64::MAX - 1, 65535, 0]); let b = 2; let expected = u64x4::from([(u64::MAX - 1) >> 2, (u64::MAX - 1) >> 2, 65535 >> 2, 0 >> 2]); let actual = a >> b; assert_eq!(expected, actual); } #[test] fn impl_u64x4_blend() { let use_t: u64 = u64::MAX; let t = u64x4::from([1, 2, 3, 4]); let f = u64x4::from([17, 18, 21, 45]); let mask = u64x4::from([use_t, 0, 0, use_t]); let expected = u64x4::from([1, 18, 21, 4]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_u64x4_cmp_eq() { let a = u64x4::from([1_u64, 4, u64::MAX, 5]); let b = u64x4::from([3_u64, 4, u64::MAX, 1]); let expected = u64x4::from([0, u64::MAX, u64::MAX, 0]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_u64x4_cmp_gt() { let a = u64x4::from([1_u64, 4, u64::MAX, 5]); let b = u64x4::from([3_u64, 4, 1, u64::MAX]); let expected = u64x4::from([0, 0, u64::MAX, 0]); let actual = a.cmp_gt(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u64x4, b| a.cmp_gt(b), |a, b| if a > b { u64::MAX } else { 0 }, ); } #[test] fn impl_u64x4_cmp_lt() { let a = u64x4::from([3_u64, 4, 1, u64::MAX]); let b = u64x4::from([1_u64, 4, u64::MAX, 5]); let expected = u64x4::from([0, 0, u64::MAX, 0]); let actual = a.cmp_lt(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u64x4, b| a.cmp_lt(b), |a, b| if a < b { u64::MAX } else { 0 }, 
); } #[cfg(feature = "serde")] #[test] fn impl_u64x4_ser_de_roundtrip() { let serialized = bincode::serialize(&u64x4::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(u64x4::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_u8x16.rs000066400000000000000000000143151473735473700175030ustar00rootroot00000000000000use wide::*; #[test] fn size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] fn impl_add_for_u8x16() { let a = u8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 250, 250]); let b = u8x16::from([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 5, 6]); let expected = u8x16::from([ 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 255, 0, ]); let actual = a + b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u8x16, b| a + b, |a, b| a.wrapping_add(b), ); } #[test] fn impl_sub_for_u8x16() { let a = u8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 0]); let b = u8x16::from([170, 18, 10, 200, 241, 2, 93, 4, 12, 8, 27, 28, 29, 30, 1, 1]); let expected = u8x16::from([ 87, 240, 249, 60, 20, 4, 170, 4, 253, 2, 240, 240, 240, 240, 0, u8::MAX, ]); let actual = a - b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u8x16, b| a - b, |a, b| a.wrapping_sub(b), ); } #[test] fn impl_saturating_add_for_u8x16() { let a = u8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 250, 250]); let b = u8x16::from([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 5, 6]); let expected = u8x16::from([ 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 255, 255, ]); let actual = a.saturating_add(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u8x16, b| a.saturating_add(b), |a, b| a.saturating_add(b), ); } #[test] fn impl_saturating_sub_for_u8x16() { let a = u8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1, 0]); let b = u8x16::from([170, 18, 10, 200, 241, 2, 93, 4, 12, 8, 27, 28, 29, 30, 1, 1]); let expected = u8x16::from([0, 0, 0, 0, 0, 4, 0, 4, 0, 2, 0, 0, 0, 0, 0, 0]); let actual = a.saturating_sub(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar( |a: u8x16, b| a.saturating_sub(b), |a, b| a.saturating_sub(b), ); } #[test] fn impl_bitand_for_u8x16() { let a = u8x16::from([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]); let b = u8x16::from([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); let expected = u8x16::from([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1]); let actual = a & b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u8x16, b| a & b, |a, b| a & b); } #[test] fn impl_bitor_for_u8x16() { let a = u8x16::from([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]); let b = u8x16::from([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); let expected = u8x16::from([0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1]); let actual = a | b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u8x16, b| a | b, |a, b| a | b); } #[test] fn impl_bitxor_for_u8x16() { let a = u8x16::from([0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]); let b = u8x16::from([0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]); let expected = u8x16::from([0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0]); let actual = a ^ b; assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u8x16, b| a ^ b, |a, b| a ^ b); } #[test] fn impl_u8x16_cmp_eq() { let a = u8x16::from([1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 
3, 4, 1, 2, 3, 4]); let b = u8x16::from([2_u8; 16]); let expected = u8x16::from([ 0, u8::MAX, 0, 0, 0, u8::MAX, 0, 0, 0, u8::MAX, 0, 0, 0, u8::MAX, 0, 0, ]); let actual = a.cmp_eq(b); assert_eq!(expected, actual); } #[test] fn impl_u8x16_blend() { let use_t: u8 = u8::MAX; let t = u8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 126, 127]); let f = u8x16::from([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1, 1]); let mask = u8x16::from([ use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, use_t, 0, ]); let expected = u8x16::from([1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11, 28, 13, 30, 126, 1]); let actual = mask.blend(t, f); assert_eq!(expected, actual); } #[test] fn impl_u8x16_max() { let a = u8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 250, 250]); let b = u8x16::from([17, 18, 19, 20, 2, 2, 2, 24, 25, 26, 27, 28, 29, 30, 5, 6]); let expected = u8x16::from([ 17, 18, 19, 20, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 250, 250, ]); let actual = a.max(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u8x16, b| a.max(b), |a, b| a.max(b)); } #[test] fn impl_u8x16_min() { let a = u8x16::from([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 250, 250]); let b = u8x16::from([17, 18, 19, 20, 2, 2, 2, 24, 25, 26, 27, 28, 29, 30, 5, 6]); let expected = u8x16::from([1, 2, 3, 4, 2, 2, 2, 8, 9, 10, 11, 12, 13, 14, 5, 6]); let actual = a.min(b); assert_eq!(expected, actual); crate::test_random_vector_vs_scalar(|a: u8x16, b| a.min(b), |a, b| a.min(b)); } #[test] fn impl_unpack_low_u8() { let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); let b = u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]); let c: [u8; 16] = u8x16::unpack_low(a, b).into(); assert_eq!(c, [0, 12, 1, 11, 2, 22, 3, 13, 4, 99, 5, 15, 6, 16, 7, 17]); } #[test] fn impl_unpack_high_u8() { let a = u8x16::from([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); let b = u8x16::from([12, 11, 22, 13, 99, 15, 16, 17, 8, 19, 2, 21, 22, 3, 24, 127]); let c: [u8; 16] = u8x16::unpack_high(a, b).into(); assert_eq!(c, [8, 8, 9, 19, 10, 2, 11, 21, 12, 22, 13, 3, 14, 24, 15, 127]); } #[test] fn impl_narrow_i16x8() { let a = i16x8::from([-1, 2, -3, 4, -5, 6, -7, 8]); let b = i16x8::from([9, 10, 11, 12, 13, -14, 15, -16]); let c: [u8; 16] = u8x16::narrow_i16x8(a, b).into(); assert_eq!(c, [0, 2, 0, 4, 0, 6, 0, 8, 9, 10, 11, 12, 13, 0, 15, 0]); } #[cfg(feature = "serde")] #[test] fn impl_u8x16_ser_de_roundtrip() { let serialized = bincode::serialize(&u8x16::ZERO).expect("serialization failed"); let deserialized = bincode::deserialize(&serialized).expect("deserializaion failed"); assert_eq!(u8x16::ZERO, deserialized); } wide-0.7.32/tests/all_tests/t_usefulness.rs000066400000000000000000000324621473735473700210070ustar00rootroot00000000000000#![allow(clippy::excessive_precision)] use wide::*; use bytemuck::*; #[test] fn unpack_modify_and_repack_rgba_values() { let mask = u32x4::from(0xFF); // let input = u32x4::from([0xFF0000FF, 0x00FF00FF, 0x0000FFFF, 0x000000FF]); // unpack let r_actual = cast::<_, i32x4>(input >> 24).round_float(); let g_actual = cast::<_, i32x4>((input >> 16) & mask).round_float(); let b_actual = cast::<_, i32x4>((input >> 8) & mask).round_float(); let a_actual = cast::<_, i32x4>(input & mask).round_float(); let r_expected = f32x4::from([255.0, 0.0, 0.0, 0.0]); let g_expected = f32x4::from([0.0, 255.0, 0.0, 0.0]); let b_expected = f32x4::from([0.0, 0.0, 255.0, 0.0]); let a_expected = f32x4::from([255.0, 255.0, 255.0, 
255.0]); assert_eq!(r_expected, r_actual); assert_eq!(g_expected, g_actual); assert_eq!(b_expected, b_actual); assert_eq!(a_expected, a_actual); // modify some of the data let r_new = (r_actual - f32x4::from(1.0)).max(f32x4::from(0.0)); let g_new = (g_actual - f32x4::from(1.0)).max(f32x4::from(0.0)); let b_new = (b_actual - f32x4::from(1.0)).max(f32x4::from(0.0)); let a_new = a_actual; // repack let r_u = cast::(r_new.round_int()); let g_u = cast::(g_new.round_int()); let b_u = cast::(b_new.round_int()); let a_u = cast::(a_new.round_int()); let output_actual = (r_u << 24) | (g_u << 16) | (b_u << 8) | (a_u); let output_expected = u32x4::from([0xFE0000FF, 0x00FE00FF, 0x0000FEFF, 0x000000FF]); assert_eq!(output_expected, output_actual); } /// Implement JPEG IDCT using i16x8. This has slightly different behavior than /// the normal 32 bit scalar implementation in libjpeg. It's a bit more accurate /// in some ways (since the constants are encoded in 15 bits instead of 12) but /// is more subject to hitting saturation during intermediate calculations, /// although that should normally not be a problem for photographic JPEGs. /// /// The main downside of this approach is that it is very slow to do saturating /// math on scalar types on some CPUs, so if you need bit-exact behavior on /// different architectures this is not the algorithm for you. #[test] fn test_dequantize_and_idct_i16() { fn to_fixed(x: f32) -> i16 { (x * 32767.0 + 0.5) as i16 } fn kernel_i16(data: [i16x8; 8]) -> [i16x8; 8] { // kernel x let a2 = data[2]; let a6 = data[6]; let b0 = a2.saturating_add(a6).mul_scale_round_n(to_fixed(0.5411961)); let c0 = b0 .saturating_sub(a6) .saturating_sub(a6.mul_scale_round_n(to_fixed(0.847759065))); let c1 = b0.saturating_add(a2.mul_scale_round_n(to_fixed(0.765366865))); let a0 = data[0]; let a4 = data[4]; let b1 = a0.saturating_add(a4); let b2 = a0.saturating_sub(a4); let x0 = b1.saturating_add(c1); let x1 = b2.saturating_add(c0); let x2 = b2.saturating_sub(c0); let x3 = b1.saturating_sub(c1); // kernel t let t0 = data[7]; let t1 = data[5]; let t2 = data[3]; let t3 = data[1]; let p1 = t0.saturating_add(t3); let p2 = t1.saturating_add(t2); let p3 = t0.saturating_add(t2); let p4 = t1.saturating_add(t3); let p5t = p3.saturating_add(p4); let p5 = p5t.saturating_add(p5t.mul_scale_round_n(to_fixed(0.175875602))); let e0 = t0.mul_scale_round_n(to_fixed(0.298631336)); let e1 = t1 .saturating_add(t1) .saturating_add(t1.mul_scale_round_n(to_fixed(0.053119869))); let e2 = t2 .saturating_add(t2) .saturating_add(t2) .saturating_add(t2.mul_scale_round_n(to_fixed(0.072711026))); let e3 = t3.saturating_add(t3.mul_scale_round_n(to_fixed(0.501321110))); let f0 = p5.saturating_sub(p1.mul_scale_round_n(to_fixed(0.899976223))); let f1 = p5 .saturating_sub(p2) .saturating_sub(p2) .saturating_sub(p2.mul_scale_round_n(to_fixed(0.562915447))); let f2 = p3.mul_scale_round_n(to_fixed(-0.961570560)).saturating_sub(p3); let f3 = p4.mul_scale_round_n(to_fixed(-0.390180644)); let t3 = f0.saturating_add(f3).saturating_add(e3); let t2 = f1.saturating_add(f2).saturating_add(e2); let t1 = f1.saturating_add(f3).saturating_add(e1); let t0 = f0.saturating_add(f2).saturating_add(e0); [ x0.saturating_add(t3), x1.saturating_add(t2), x2.saturating_add(t1), x3.saturating_add(t0), x3.saturating_sub(t0), x2.saturating_sub(t1), x1.saturating_sub(t2), x0.saturating_sub(t3), ] } #[rustfmt::skip] let coefficients: [i16; 8 * 8] = [ -14, -39, 58, -2, 3, 3, 0, 1, 11, 27, 4, -3, 3, 0, 1, 0, -6, -13, -9, -1, -2, -1, 0, 0, -4, 0, -1, -2, 0, 0, 0, 
0, 3, 0, 0, 0, 0, 0, 0, 0, -3, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]; #[rustfmt::skip] let quantization_table: [i16; 8 * 8] = [ 8, 6, 5, 8, 12, 20, 26, 31, 6, 6, 7, 10, 13, 29, 30, 28, 7, 7, 8, 12, 20, 29, 35, 28, 7, 9, 11, 15, 26, 44, 40, 31, 9, 11, 19, 28, 34, 55, 52, 39, 12, 18, 28, 32, 41, 52, 57, 46, 25, 32, 39, 44, 52, 61, 60, 51, 36, 46, 48, 49, 56, 50, 52, 50 ]; let c: [i16x8; 8] = cast(coefficients); let q: [i16x8; 8] = cast(quantization_table); // coefficients normally go up to 1024, shift up by 3 to get extra precision const SHIFT: i16 = 3; let data = [ (c[0] * q[0]) << SHIFT, (c[1] * q[1]) << SHIFT, (c[2] * q[2]) << SHIFT, (c[3] * q[3]) << SHIFT, (c[4] * q[4]) << SHIFT, (c[5] * q[5]) << SHIFT, (c[6] * q[6]) << SHIFT, (c[7] * q[7]) << SHIFT, ]; let pass1 = kernel_i16(data); let transpose1 = i16x8::transpose(pass1); let pass2 = kernel_i16(transpose1); let result = i16x8::transpose(pass2); // offset to recenter to 0..256 and round properly const ROUND_FACTOR: i16 = 0x2020; let round_factor = i16x8::splat(ROUND_FACTOR); let result_adj = [ result[0].saturating_add(round_factor) >> (2 * SHIFT), result[1].saturating_add(round_factor) >> (2 * SHIFT), result[2].saturating_add(round_factor) >> (2 * SHIFT), result[3].saturating_add(round_factor) >> (2 * SHIFT), result[4].saturating_add(round_factor) >> (2 * SHIFT), result[5].saturating_add(round_factor) >> (2 * SHIFT), result[6].saturating_add(round_factor) >> (2 * SHIFT), result[7].saturating_add(round_factor) >> (2 * SHIFT), ]; let output: [i16; 64] = cast(result_adj); #[rustfmt::skip] let expected_output = [ 118, 92, 110, 83, 77, 93, 144, 198, 172, 116, 114, 87, 78, 93, 146, 191, 194, 107, 91, 76, 71, 93, 160, 198, 196, 100, 80, 74, 67, 92, 174, 209, 182, 104, 88, 81, 68, 89, 178, 206, 105, 64, 59, 59, 63, 94, 183, 201, 35, 27, 28, 37, 72, 121, 203, 204, 38, 45, 41, 47, 99, 154, 223, 208 ]; assert_eq!(expected_output, output); } /// Implement JPEG IDCT using i32x8. This is most similar to the scalar /// libjpeg version which has slightly different rounding propertis than the 16 /// bit version. Some decoders are forced to use this if they want bit-by-bit /// compatibility across all architectures. 
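///
/// Here the fixed-point constants are scaled by 4096 (12 bits, see `to_fixed`
/// below), versus the roughly 15-bit scale (32767) used by the i16 variant
/// above.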
#[test] fn test_dequantize_and_idct_i32() { fn to_fixed(x: f32) -> i32 { (x * 4096.0 + 0.5) as i32 } fn kernel_i32( [s0, s1, s2, s3, s4, s5, s6, s7]: [i32x8; 8], rounding_factor: i32, shift_right: i32, ) -> [i32x8; 8] { // kernel x let at = (s2 + s6) * to_fixed(0.5411961); let a0 = (s0 + s4) << 12; // multiply by 1, ie 4096 in fixed point) let a1 = (s0 - s4) << 12; // multiply by 1, ie 4096 in fixed point) let a2 = at + s6 * to_fixed(-1.847759065); let a3 = at + s2 * to_fixed(0.765366865); let x0 = a0 + a3 + rounding_factor; // add rounding factor here to avoid extra addition let x1 = a1 + a2 + rounding_factor; let x2 = a1 - a2 + rounding_factor; let x3 = a0 - a3 + rounding_factor; // kernel t let b0 = s7 + s1; let b1 = s5 + s3; let b2 = s7 + s3; let b3 = s5 + s1; let ct = (b2 + b3) * to_fixed(1.175875602); let c0 = ct + b0 * to_fixed(-0.899976223); let c1 = ct + b1 * to_fixed(-2.562915447); let c2 = b2 * to_fixed(-1.961570560); let c3 = b3 * to_fixed(-0.390180644); let t0 = s7 * to_fixed(0.298631336) + c0 + c2; let t1 = s5 * to_fixed(2.053119869) + c1 + c3; let t2 = s3 * to_fixed(3.072711026) + c1 + c2; let t3 = s1 * to_fixed(1.501321110) + c0 + c3; [ (x0 + t3) >> shift_right, (x1 + t2) >> shift_right, (x2 + t1) >> shift_right, (x3 + t0) >> shift_right, (x3 - t0) >> shift_right, (x2 - t1) >> shift_right, (x1 - t2) >> shift_right, (x0 - t3) >> shift_right, ] } #[rustfmt::skip] let coefficients: [i32; 8 * 8] = [ -14, -39, 58, -2, 3, 3, 0, 1, 11, 27, 4, -3, 3, 0, 1, 0, -6, -13, -9, -1, -2, -1, 0, 0, -4, 0, -1, -2, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, -3, -2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]; #[rustfmt::skip] let quantization_table: [i32; 8 * 8] = [ 8, 6, 5, 8, 12, 20, 26, 31, 6, 6, 7, 10, 13, 29, 30, 28, 7, 7, 8, 12, 20, 29, 35, 28, 7, 9, 11, 15, 26, 44, 40, 31, 9, 11, 19, 28, 34, 55, 52, 39, 12, 18, 28, 32, 41, 52, 57, 46, 25, 32, 39, 44, 52, 61, 60, 51, 36, 46, 48, 49, 56, 50, 52, 50 ]; let c: [i32x8; 8] = cast(coefficients); let q: [i32x8; 8] = cast(quantization_table); let scaled = [ c[0] * q[0], c[1] * q[1], c[2] * q[2], c[3] * q[3], c[4] * q[4], c[5] * q[5], c[6] * q[6], c[7] * q[7], ]; // add rounding factor before shifting right let pass1 = kernel_i32(scaled, 1 << 9, 10); let transpose1 = i32x8::transpose(pass1); // add rounding factor before shifting right (include rebasing from -128..128 // to 0..256) let pass2 = kernel_i32(transpose1, 65536 + (128 << 17), 17); let result = i32x8::transpose(pass2); let output: [i32; 64] = cast(result); // same as other DCT test with some minor rounding differences #[rustfmt::skip] let expected_output = [ 118, 92, 110, 83, 77, 93, 144, 198, 172, 116, 114, 87, 78, 93, 146, 191, 194, 107, 91, 76, 71, 93, 160, 198, 196, 100, 80, 74, 67, 92, 174, 209, 182, 104, 88, 81, 68, 89, 178, 206, 105, 64, 59, 59, 63, 94, 183, 201, 35, 27, 28, 37, 72, 121, 203, 204, 37, 45, 41, 47, 98, 154, 223, 208]; assert_eq!(expected_output, output); } // Example implementation of a branch-free division algorithm using u32x8. /// Ported from libdivide. Example to show how to use the branchfree division /// with this library. fn internal_gen_branch_free_u32(d: u32) -> (u32, u32) { fn div_rem(a: u64, b: u64) -> (u64, u64) { (a / b, a % b) } // branchfree cannot be one or zero assert!(d > 1); let floor_log_2_d = (32u32 - 1) - d.leading_zeros(); // Power of 2 if (d & (d - 1)) == 0 { // We need to subtract 1 from the shift value in case of an unsigned // branchfree divider because there is a hardcoded right shift by 1 // in its division algorithm. 
Because of this we also need to add back // 1 in its recovery algorithm. (0, floor_log_2_d - 1) } else { let (proposed_m, rem) = div_rem(1u64 << (floor_log_2_d + 32), d as u64); let mut proposed_m = proposed_m as u32; let rem = rem as u32; assert!(rem > 0 && rem < d); // This power works if e < 2**floor_log_2_d. // We have to use the general 33-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and // its remainder. By doubling both, and then correcting the // remainder, we can compute the larger division. // don't care about overflow here - in fact, we expect it proposed_m = proposed_m.wrapping_add(proposed_m); let twice_rem = rem.wrapping_add(rem); if twice_rem >= d || twice_rem < rem { proposed_m += 1; } (1 + proposed_m, floor_log_2_d) // result.more's shift should in general be ceil_log_2_d. But if we // used the smaller power, we subtract one from the shift because we're // using the smaller power. If we're using the larger power, we // subtract one from the shift because it's taken care of by the add // indicator. So floor_log_2_d happens to be correct in both cases. } } /// Generate magic and shift values for branch-free division. fn generate_branch_free_divide_magic_shift(denom: u32x8) -> (u32x8, u32x8) { let mut magic = u32x8::ZERO; let mut shift = u32x8::ZERO; for i in 0..magic.as_array_ref().len() { let (m, s) = internal_gen_branch_free_u32(denom.as_array_ref()[i]); magic.as_array_mut()[i] = m; shift.as_array_mut()[i] = s; } (magic, shift) } // using the previously generated magic and shift, calculate the division fn branch_free_divide(numerator: u32x8, magic: u32x8, shift: u32x8) -> u32x8 { let q = u32x8::mul_keep_high(numerator, magic); let t = ((numerator - q) >> 1) + q; t >> shift } #[test] fn impl_u32x8_branch_free_divide() { crate::test_random_vector_vs_scalar( |a: u32x8, b| { // never divide by 0 or 1 (since the branch free division doesn't support // division by 1) let b = b.max(u32x8::splat(2)); let (magic, shift) = generate_branch_free_divide_magic_shift(b); branch_free_divide(a, magic, shift) }, |a, b| a / b.max(2), ); }
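// Hedged addition, not part of the original suite: a scalar sketch of the same
// branchfree recovery sequence that `branch_free_divide` above performs per
// lane, checked against ordinary integer division for one denominator. The
// test name and the sample values here are illustrative assumptions.
#[test]
fn impl_branch_free_divide_scalar_sketch() {
  let d = 7u32;
  let (magic, shift) = internal_gen_branch_free_u32(d);
  for &n in &[0u32, 1, 6, 7, 8, 1_000_000, u32::MAX] {
    // High 32 bits of the widening product, the scalar equivalent of
    // `u32x8::mul_keep_high`.
    let q = ((u64::from(n) * u64::from(magic)) >> 32) as u32;
    // Branchfree recovery as in `branch_free_divide`: t = q + ((n - q) >> 1),
    // then shift by the stored amount.
    let t = ((n - q) >> 1) + q;
    assert_eq!(t >> shift, n / d);
  }
}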