pax_global_header00006660000000000000000000000064144552620040014514gustar00rootroot0000000000000052 comment=46cd2e782f75971440c57ce43f8a0a1d2127b67d safe_arch-0.7.1/000077500000000000000000000000001445526200400134345ustar00rootroot00000000000000safe_arch-0.7.1/.cargo/000077500000000000000000000000001445526200400146055ustar00rootroot00000000000000safe_arch-0.7.1/.cargo/config.toml000066400000000000000000000005371445526200400167540ustar00rootroot00000000000000 # Note: Cargo doesn't carry these settings to dependencies. They only affect the # process of directly building the crate. This is so that we can easily use # `cargo test` and `cargo doc` and so on during development. [build] # This can cause weirdness if they don't match! rustflags = ["-Ctarget-cpu=native"] rustdocflags = ["-Ctarget-cpu=native"] safe_arch-0.7.1/.github/000077500000000000000000000000001445526200400147745ustar00rootroot00000000000000safe_arch-0.7.1/.github/workflows/000077500000000000000000000000001445526200400170315ustar00rootroot00000000000000safe_arch-0.7.1/.github/workflows/rust.yml000066400000000000000000000031701445526200400205520ustar00rootroot00000000000000name: Rust on: push: {} pull_request: {} schedule: # Min Hr Day Month Weekday; so this should be 1:05am each day. - cron: '5 1 * * *' jobs: build_test: runs-on: windows-latest strategy: matrix: rust: # x86 without sse/sse2 on by default - { target: i586-pc-windows-msvc, toolchain: 1.51.0 } - { target: i586-pc-windows-msvc, toolchain: stable } - { target: i586-pc-windows-msvc, toolchain: beta } - { target: i586-pc-windows-msvc, toolchain: nightly } # x86 - { target: i686-pc-windows-msvc, toolchain: 1.51.0 } - { target: i686-pc-windows-msvc, toolchain: stable } - { target: i686-pc-windows-msvc, toolchain: beta } - { target: i686-pc-windows-msvc, toolchain: nightly } # x86_64 - { target: x86_64-pc-windows-msvc, toolchain: 1.51.0 } - { target: x86_64-pc-windows-msvc, toolchain: stable } - { target: x86_64-pc-windows-msvc, toolchain: beta } - { target: x86_64-pc-windows-msvc, toolchain: nightly } steps: - uses: actions/checkout@v1 - uses: actions-rs/toolchain@v1 with: toolchain: ${{ matrix.rust.toolchain }} target: ${{ matrix.rust.target }} profile: minimal default: true - name: suppress target-cpu=native on i586 if: matrix.rust.target == 'i586-pc-windows-msvc' run: rm .cargo/config.toml - name: Run tests with default features run: cargo test --target ${{ matrix.rust.target }} - name: Run tests with all features run: cargo test --target ${{ matrix.rust.target }} --all-features safe_arch-0.7.1/.gitignore000066400000000000000000000000231445526200400154170ustar00rootroot00000000000000/target Cargo.lock safe_arch-0.7.1/Cargo.toml000066400000000000000000000013631445526200400153670ustar00rootroot00000000000000[package] name = "safe_arch" description = "Crate that exposes `core::arch` safely via `#[cfg()]`." repository = "https://github.com/Lokathor/safe_arch" version = "0.7.1" authors = ["Lokathor "] edition = "2018" license = "Zlib OR Apache-2.0 OR MIT" categories = ["api-bindings", "hardware-support", "no-std"] keywords = ["intrinsics", "simd"] [dependencies] # If enabled, gives bytemuck trait impls for our types bytemuck = {version = "1.2", optional = true} [features] default = [] [profile.test] opt-level = 3 [package.metadata.docs.rs] # This makes docs.rs build with all features enabled and # also it will indicate what fn needs what cpu feature. 
rustdoc-args = ["-C","target-cpu=native","--cfg","docs_rs"] all-features = true safe_arch-0.7.1/LICENSE-APACHE.md000066400000000000000000000242361445526200400157660ustar00rootroot00000000000000Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. safe_arch-0.7.1/LICENSE-MIT.md000066400000000000000000000021261445526200400154700ustar00rootroot00000000000000MIT License Copyright (c) 2023 Daniel "Lokathor" Gee. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice (including the next paragraph) shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. safe_arch-0.7.1/LICENSE-ZLIB.md000066400000000000000000000015231445526200400155770ustar00rootroot00000000000000Copyright (c) 2020 Daniel "Lokathor" Gee. This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. safe_arch-0.7.1/README.md000066400000000000000000000034701445526200400147170ustar00rootroot00000000000000# [Docs.rs](https://docs.rs/safe_arch) # safe_arch Exposes arch-specific intrinsics as safe function. * SIMD types are newtype'd (with a `pub` field) and given appropriate trait impls such as `From`, `Into`, `Default`, etc. * Each intrinsic gets either a function or macro so that you can safely use it as directly as possible. * Functions are used when all arguments are runtime arguments. * Macros are used when one of the arguments must be a compile time constant, because Rust doesn't let you "pass through" compile time constants. * There's hundreds and hundreds of intrinsics, so the names of functions and macros tend to be very long and specific because there's often many similar ways to do nearly the same thing. * This crate isn't really intended for "everyday users". It is intended to be an "unopinionated" middle layer crate that just provides the safety. 
Higher level abstractions should mostly come from some other crate that wraps over this crate. All function and macro availability is done purely at compile time via `#[cfg()]` attributes on the various modules. If a CPU feature isn't enabled for the build then those functions or macros won't be available. If you'd like to determine what CPU features are available at runtime and then call different code accordingly, this crate is not for you. See the [crate docs](https://docs.rs/safe_arch) for more details. ## Additional Resources * [Intel Intrinsics Guide](https://software.intel.com/sites/landingpage/IntrinsicsGuide/) * [Raw Xml v3.5.2](https://software.intel.com/sites/landingpage/IntrinsicsGuide/files/data-3.5.2.xml) and you can check their [release notes](https://software.intel.com/sites/landingpage/IntrinsicsGuide/files/ReleaseNotes.html) to see if a later version has been put out since this readme file was last updated. safe_arch-0.7.1/rustfmt.toml000066400000000000000000000004161445526200400160360ustar00rootroot00000000000000# Stable edition = "2018" fn_args_layout = "Compressed" max_width = 9000 tab_spaces = 2 use_field_init_shorthand = true use_try_shorthand = true use_small_heuristics = "Max" # Unstable format_code_in_doc_comments = true imports_granularity="Crate" wrap_comments = true safe_arch-0.7.1/scripts/000077500000000000000000000000001445526200400151235ustar00rootroot00000000000000safe_arch-0.7.1/scripts/intel-helper.py000077500000000000000000000060251445526200400200730ustar00rootroot00000000000000#!/usr/bin/python import sys import xml.etree.ElementTree as xml SKIPZERO = True COLORS = True """ A script used to filter intel intrinsics XML and print out instructions related with them In case of problems with ansi colors on Windows, switch COLORS to False Also, it uses \b character hack to "truncate" strings (while printing \b moves cursor one character back, so it appears as truncated) Usage: ./intel-helper.py TECH? Examples: ./intel-helper.py AVX2 # filters avx2 instructions ./intel-helper.py # prints all intrinsics Setting SKIPZERO means this script won't print intrinsics with no assembly instructions related with them. 
This may happen because the XML may not be 100% completed, but also most of casts just change type and do not touch the bits Link to the XML: https://software.intel.com/sites/landingpage/IntrinsicsGuide/files/data-3.5.0.xml (beware, for a text file, its huge) """ CBLUE = '' CGREEN = '' CEND = '' CBEIGE = '' if COLORS: CBLUE = '\033[94m' CGREEN = '\033[92m' CEND = '\x1b[0m' CBEIGE = '\33[36m' tech = '' filename = '' if len(sys.argv) >= 2: filename = sys.argv[1] else: print("Not enough arguments") exit(1) def doc_helper(args, tree): tech = '' if len(args) != 0: tech = args[0] for child in tree.getroot(): if len(tech) != 0 and child.attrib["tech"] != tech: continue name = child.attrib["name"] instructions = '' for instruction in child.findall("instruction"): iname = instruction.attrib["name"].lower() iform = instruction.get("form") if iform is None: instructions += f"`{iname}`, " else: instructions += f"`{iname} {iform}`, " print(f"/// * **Intrinsic:** [`{name}`]") print(f"/// * **Assembly:** {instructions[:-2]}", end='\n\n') def pretty_print_intrinsics_and_instructions(args, tree): tech = '' if len(args) != 0: tech = args[0] for child in tree.getroot(): if len(tech) != 0 and child.attrib["tech"] != tech: continue instructions = child.findall("instruction") instr_no = len(instructions) intrinsic_name = child.attrib["name"] intrinsic_name += '(' for arg in child.findall("parameter"): typ = arg.attrib["type"] name = arg.attrib.get("varname", "\b") intrinsic_name += f"{CBEIGE}{typ}{CBLUE} {name}, " intrinsic_name = intrinsic_name[:-2] + ')' if instr_no == 0 and SKIPZERO: continue elif instr_no == 1: print(f"{CBLUE}{intrinsic_name}{CEND}", end=': ') else: print(f"{CBLUE}{intrinsic_name}{CEND}") for instr in instructions: attribs = instr.attrib name = attribs["name"].lower() form = attribs.get("form", "\b") print(f"{CGREEN}{name}{CEND} {form}") doc_helper(sys.argv[2:], xml.parse(filename)) #pretty_print_intrinsics_and_instructions(sys.argv[2:], xml.parse(filename)) safe_arch-0.7.1/src/000077500000000000000000000000001445526200400142235ustar00rootroot00000000000000safe_arch-0.7.1/src/lib.rs000066400000000000000000000326461445526200400153520ustar00rootroot00000000000000#![no_std] #![warn(missing_docs)] #![allow(unused_imports)] #![allow(clippy::too_many_arguments)] #![allow(clippy::transmute_ptr_to_ptr)] #![cfg_attr(docs_rs, feature(doc_cfg))] //! A crate that safely exposes arch intrinsics via `#[cfg()]`. //! //! `safe_arch` lets you safely use CPU intrinsics. Those things in the //! [`core::arch`](core::arch) modules. It works purely via `#[cfg()]` and //! compile time CPU feature declaration. If you want to check for a feature at //! runtime and then call an intrinsic or use a fallback path based on that then //! this crate is sadly not for you. //! //! SIMD register types are "newtype'd" so that better trait impls can be given //! to them, but the inner value is a `pub` field so feel free to just grab it //! out if you need to. Trait impls of the newtypes include: `Default` (zeroed), //! `From`/`Into` of appropriate data types, and appropriate operator //! overloading. //! //! * Most intrinsics (like addition and multiplication) are totally safe to use //! as long as the CPU feature is available. In this case, what you get is 1:1 //! with the actual intrinsic. //! * Some intrinsics take a pointer of an assumed minimum alignment and //! validity span. For these, the `safe_arch` function takes a reference of an //! appropriate type to uphold safety. //! 
* Try the [bytemuck](https://docs.rs/bytemuck) crate (and turn on the //! `bytemuck` feature of this crate) if you want help safely casting //! between reference types. //! * Some intrinsics are not safe unless you're _very_ careful about how you //! use them, such as the streaming operations requiring you to use them in //! combination with an appropriate memory fence. Those operations aren't //! exposed here. //! * Some intrinsics mess with the processor state, such as changing the //! floating point flags, saving and loading special register state, and so //! on. LLVM doesn't really support you messing with that within a high level //! language, so those operations aren't exposed here. Use assembly or //! something if you want to do that. //! //! ## Naming Conventions //! The `safe_arch` crate does not simply use the "official" names for each //! intrinsic, because the official names are generally poor. Instead, the //! operations have been given better names that makes things hopefully easier //! to understand then you're reading the code. //! //! For a full explanation of the naming used, see the [Naming //! Conventions](crate::naming_conventions) page. //! //! ## Current Support //! * `x86` / `x86_64` (Intel, AMD, etc) //! * 128-bit: `sse`, `sse2`, `sse3`, `ssse3`, `sse4.1`, `sse4.2` //! * 256-bit: `avx`, `avx2` //! * Other: `adx`, `aes`, `bmi1`, `bmi2`, `fma`, `lzcnt`, `pclmulqdq`, //! `popcnt`, `rdrand`, `rdseed` //! //! ## Compile Time CPU Target Features //! //! At the time of me writing this, Rust enables the `sse` and `sse2` CPU //! features by default for all `i686` (x86) and `x86_64` builds. Those CPU //! features are built into the design of `x86_64`, and you'd need a _super_ old //! `x86` CPU for it to not support at least `sse` and `sse2`, so they're a safe //! bet for the language to enable all the time. In fact, because the standard //! library is compiled with them enabled, simply trying to _disable_ those //! features would actually cause ABI issues and fill your program with UB //! ([link][rustc_docs]). //! //! If you want additional CPU features available at compile time you'll have to //! enable them with an additional arg to `rustc`. For a feature named `name` //! you pass `-C target-feature=+name`, such as `-C target-feature=+sse3` for //! `sse3`. //! //! You can alternately enable _all_ target features of the current CPU with `-C //! target-cpu=native`. This is primarily of use if you're building a program //! you'll only run on your own system. //! //! It's sometimes hard to know if your target platform will support a given //! feature set, but the [Steam Hardware Survey][steam-survey] is generally //! taken as a guide to what you can expect people to have available. If you //! click "Other Settings" it'll expand into a list of CPU target features and //! how common they are. These days, it seems that `sse3` can be safely assumed, //! and `ssse3`, `sse4.1`, and `sse4.2` are pretty safe bets as well. The stuff //! above 128-bit isn't as common yet, give it another few years. //! //! **Please note that executing a program on a CPU that doesn't support the //! target features it was compiles for is Undefined Behavior.** //! //! Currently, Rust doesn't actually support an easy way for you to check that a //! feature enabled at compile time is _actually_ available at runtime. There is //! the "[feature_detected][feature_detected]" family of macros, but if you //! enable a feature they will evaluate to a constant `true` instead of actually //! 
deferring the check for the feature to runtime. This means that, if you //! _did_ want a check at the start of your program, to confirm that all the //! assumed features are present and error out when the assumptions don't hold, //! you can't use that macro. You gotta use CPUID and check manually. rip. //! Hopefully we can make that process easier in a future version of this crate. //! //! [steam-survey]: //! https://store.steampowered.com/hwsurvey/Steam-Hardware-Software-Survey-Welcome-to-Steam //! [feature_detected]: //! https://doc.rust-lang.org/std/index.html?search=feature_detected //! [rustc_docs]: https://doc.rust-lang.org/rustc/targets/known-issues.html //! //! ### A Note On Working With Cfg //! //! There's two main ways to use `cfg`: //! * Via an attribute placed on an item, block, or expression: //! * `#[cfg(debug_assertions)] println!("hello");` //! * Via a macro used within an expression position: //! * `if cfg!(debug_assertions) { println!("hello"); }` //! //! The difference might seem small but it's actually very important: //! * The attribute form will include code or not _before_ deciding if all the //! items named and so forth really exist or not. This means that code that is //! configured via attribute can safely name things that don't always exist as //! long as the things they name do exist whenever that code is configured //! into the build. //! * The macro form will include the configured code _no matter what_, and then //! the macro resolves to a constant `true` or `false` and the compiler uses //! dead code elimination to cut out the path not taken. //! //! This crate uses `cfg` via the attribute, so the functions it exposes don't //! exist at all when the appropriate CPU target features aren't enabled. //! Accordingly, if you plan to call this crate or not depending on what //! features are enabled in the build you'll also need to control your use of //! this crate via cfg attribute, not cfg macro. use core::{ convert::AsRef, fmt::{Binary, Debug, Display, LowerExp, LowerHex, Octal, UpperExp, UpperHex}, ops::{Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Div, DivAssign, Mul, MulAssign, Neg, Not, Sub, SubAssign}, }; pub mod naming_conventions; /// Declares a private mod and then a glob `use` with the visibility specified. macro_rules! submodule { ($v:vis $name:ident) => { mod $name; $v use $name::*; }; ($v:vis $name:ident { $($content:tt)* }) => { mod $name { $($content)* } $v use $name::*; }; } // Note(Lokathor): Stupid as it sounds, we need to put the imports here at the // crate root because the arch-specific macros that we define in our inner // modules are actually "scoped" to also be at the crate root. We want the // rustdoc generation of the macros to "see" these imports so that the docs link // over to the `core::arch` module correctly. // https://github.com/rust-lang/rust/issues/72243 #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] submodule!(pub x86_x64 { //! Types and functions for safe `x86` / `x86_64` intrinsic usage. //! //! `x86_64` is essentially a superset of `x86`, so we just lump it all into //! one module. Anything not available on `x86` simply won't be in the build //! on that arch. 
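//!
//! As a small sketch of the cfg-attribute pattern described in the crate-level
//! docs (assuming a build where the `avx` target feature is enabled, since
//! `add_m256` only exists in the build when it is), downstream code gates its
//! calls into this module with the `cfg` attribute rather than the `cfg!`
//! macro:
//!
//! ```
//! # use safe_arch::*;
//! #[cfg(target_feature = "avx")]
//! fn add_eight_f32(a: m256, b: m256) -> m256 {
//!   add_m256(a, b)
//! }
//! ```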
use super::*; submodule!(pub m128_); submodule!(pub m128d_); submodule!(pub m128i_); submodule!(pub m256_); submodule!(pub m256d_); submodule!(pub m256i_); // Note(Lokathor): We only include these sub-modules with the actual functions // if the feature is enabled. Ae *also* have a cfg attribute on the inside of // the modules as a "double-verification" of sorts. Technically either way on // its own would also be fine. // These CPU features follow a fairly clear and strict progression that's easy // to remember. Most of them offer a fair pile of new functions. #[cfg(target_feature = "sse")] submodule!(pub sse); #[cfg(target_feature = "sse2")] submodule!(pub sse2); #[cfg(target_feature = "sse3")] submodule!(pub sse3); #[cfg(target_feature = "ssse3")] submodule!(pub ssse3); #[cfg(target_feature = "sse4.1")] submodule!(pub sse4_1); #[cfg(target_feature = "sse4.2")] submodule!(pub sse4_2); #[cfg(target_feature = "avx")] submodule!(pub avx); #[cfg(target_feature = "avx2")] submodule!(pub avx2); // These features aren't as easy to remember the progression of and they each // only add a small handful of functions. #[cfg(target_feature = "adx")] submodule!(pub adx); #[cfg(target_feature = "aes")] submodule!(pub aes); #[cfg(target_feature = "bmi1")] submodule!(pub bmi1); #[cfg(target_feature = "bmi2")] submodule!(pub bmi2); #[cfg(target_feature = "fma")] submodule!(pub fma); #[cfg(target_feature = "lzcnt")] submodule!(pub lzcnt); #[cfg(target_feature = "pclmulqdq")] submodule!(pub pclmulqdq); #[cfg(target_feature = "popcnt")] submodule!(pub popcnt); #[cfg(target_feature = "rdrand")] submodule!(pub rdrand); #[cfg(target_feature = "rdseed")] submodule!(pub rdseed); /// Reads the CPU's timestamp counter value. /// /// This is a monotonically increasing time-stamp that goes up every clock /// cycle of the CPU. However, since modern CPUs are variable clock rate /// depending on demand this can't actually be used for telling the time. It /// also does _not_ fully serialize all operations, so previous instructions /// might still be in progress when this reads the timestamp. /// /// * **Intrinsic:** `_rdtsc` /// * **Assembly:** `rdtsc` pub fn read_timestamp_counter() -> u64 { // Note(Lokathor): This was changed from i64 to u64 at some point, but // everyone ever was already casting this value to `u64` so crater didn't // even consider it a problem. We will follow suit. #[allow(clippy::unnecessary_cast)] unsafe { _rdtsc() as u64 } } /// Reads the CPU's timestamp counter value and store the processor signature. /// /// This works similar to [`read_timestamp_counter`] with two main /// differences: /// * It and also stores the `IA32_TSC_AUX MSR` value to the reference given. /// * It waits on all previous instructions to finish before reading the /// timestamp (though it doesn't prevent other instructions from starting). /// /// As with `read_timestamp_counter`, you can't actually use this to tell the /// time. /// /// * **Intrinsic:** `__rdtscp` /// * **Assembly:** `rdtscp` pub fn read_timestamp_counter_p(aux: &mut u32) -> u64 { unsafe { __rdtscp(aux) } } /// Swap the bytes of the given 32-bit value. /// /// ``` /// # use safe_arch::*; /// assert_eq!(byte_swap_i32(0x0A123456), 0x5634120A); /// ``` /// * **Intrinsic:** `_bswap` /// * **Assembly:** `bswap r32` pub fn byte_swap_i32(i: i32) -> i32 { unsafe { _bswap(i) } } /// Swap the bytes of the given 64-bit value. 
/// /// ``` /// # use safe_arch::*; /// assert_eq!(byte_swap_i64(0x0A123456_789ABC01), 0x01BC9A78_5634120A); /// ``` /// * **Intrinsic:** `_bswap64` /// * **Assembly:** `bswap r64` #[cfg(target_arch="x86_64")] pub fn byte_swap_i64(i: i64) -> i64 { unsafe { _bswap64(i) } } /// Turns a round operator token to the correct constant value. #[macro_export] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] macro_rules! round_op { (Nearest) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::{ _MM_FROUND_NO_EXC, _MM_FROUND_TO_NEAREST_INT, }; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::{ _MM_FROUND_NO_EXC, _MM_FROUND_TO_NEAREST_INT, }; _MM_FROUND_NO_EXC | _MM_FROUND_TO_NEAREST_INT }}; (NegInf) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::{ _MM_FROUND_NO_EXC, _MM_FROUND_TO_NEG_INF, }; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::{ _MM_FROUND_NO_EXC, _MM_FROUND_TO_NEG_INF, }; _MM_FROUND_NO_EXC | _MM_FROUND_TO_NEG_INF }}; (PosInf) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::{ _MM_FROUND_NO_EXC, _MM_FROUND_TO_POS_INF, }; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::{ _MM_FROUND_NO_EXC, _MM_FROUND_TO_POS_INF, }; _MM_FROUND_NO_EXC | _MM_FROUND_TO_POS_INF }}; (Zero) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::{ _mm256_round_pd, _MM_FROUND_NO_EXC, _MM_FROUND_TO_ZERO, }; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::{ _mm256_round_pd, _MM_FROUND_NO_EXC, _MM_FROUND_TO_ZERO, }; _MM_FROUND_NO_EXC | _MM_FROUND_TO_ZERO }}; } }); safe_arch-0.7.1/src/naming_conventions.rs000066400000000000000000000243311445526200400204720ustar00rootroot00000000000000//! An explanation of the crate's naming conventions. //! //! This crate attempts to follow the general naming scheme of `verb_type` when //! the operation is "simple", and `verb_description_words_type` when the //! operation (op) needs to be more specific than normal. Like this: //! * `add_m128` //! * `add_saturating_i8_m128i` //! //! ## Types //! Currently, only `x86` and `x86_64` types are supported. Among those types: //! * `m128` and `m256` are always considered to hold `f32` lanes. //! * `m128d` and `m256d` are always considered to hold `f64` lanes. //! * `m128i` and `m256i` hold integer data, but each op specifies what lane //! width of integers the operation uses. //! * If the type has `_s` on the end then it's a "scalar" operation that //! affects just the lowest lane. The other lanes are generally copied forward //! from one of the inputs, though the details there vary from op to op. //! * The SIMD types are often referred to as "registers" because each SIMD //! typed value represents exactly one CPU register when you're doing work. //! //! ## Operations //! There's many operations that can be performed. When possible, `safe_arch` //! tries to follow normal Rust naming (eg: adding is still `add` and left //! shifting is still `shl`), but if an operation doesn't normally exist at all //! in Rust then we basically have to make something up. //! //! Many operations have more than one variant, such as `add` and also //! `add_saturating`. In this case, `safe_arch` puts the "core operation" first //! and then any "modifiers" go after, which isn't how you might normally say it //! in English, but it makes the list of functions sort better. //! //! As a general note on SIMD terminology: When an operation uses the same //! indexed lane in two _different_ registers to determine the output, that is a //! "vertical" operation. 
When an operation uses more than one lane in the //! _same_ register to determine the output, that is a "horizontal" operation. //! * Vertical: `out[0] = a[0] + b[0]`, `out[1] = a[1] + b[1]` //! * Horizontal: `out[0] = a[0] + a[1]`, `out[1] = b[0] + b[1]` //! //! ## Operation Glossary //! Here follows the list of all the main operations and their explanations. //! //! * `abs`: Absolute value (wrapping). //! * `add`: Addition. This is "wrapping" by default, though some other types of //! addition are available. Remember that wrapping signed addition is the same //! as wrapping unsigned addition. //! * `average`: Averages the two inputs. //! * `bitand`: Bitwise And, `a & b`, like [the trait](core::ops::BitAnd). //! * `bitandnot`: Bitwise `(!a) & b`. This seems a little funny at first but //! it's useful for clearing bits. The output will be based on the `b` side's //! bit pattern, but with all active bits in `a` cleared: //! * `bitandnot(0b0010, 0b1011) == 0b1001` //! * `bitor`: Bitwise Or, `a | b`, like [the trait](core::ops::BitOr). //! * `bitxor`: Bitwise eXclusive Or, `a ^ b`, like [the //! trait](core::ops::BitXor). //! * `blend`: Merge the data lanes of two SIMD values by taking either the `b` //! value or `a` value for each lane. Depending on the instruction, the blend //! mask can be either an immediate or a runtime value. //! * `cast`: Convert between data types while preserving the exact bit //! patterns, like how [`transmute`](core::mem::transmute) works. //! * `ceil`: "Ceiling", rounds towards positive infinity. //! * `cmp`: Numeric comparisons of various kinds. This generally gives "mask" //! output where the output value is of the same data type as the inputs, but //! with all the bits in a "true" lane as 1 and all the bits in a "false" lane //! as 0. Remember that with floating point values all 1s bits is a NaN, and //! with signed integers all 1s bits is -1. //! * An "Ordered comparison" checks if _neither_ floating point value is NaN. //! * An "Unordered comparison" checks if _either_ floating point value is //! NaN. //! * `convert`: This does some sort of numeric type change. The details can //! vary wildly. Generally, if the number of lanes goes down then the lowest //! lanes will be kept. If the number of lanes goes up then the new high lanes //! will be zero. //! * `div`: Division. //! * `dot_product`: This works like the matrix math operation. The lanes are //! multiplied and then the results are summed up into a single value. //! * `duplicate`: Copy the even or odd indexed lanes to the other set of lanes. //! Eg, `[1, 2, 3, 4]` becomes `[1, 1, 3, 3]` or `[2, 2, 4, 4]`. //! * `extract`: Get a value from the lane of a SIMD type into a scalar type. //! * `floor`: Rounds towards negative infinity. //! * `fused`: All the fused operations are a multiply as well as some sort of //! adding or subtracting. The details depend on which fused operation you //! select. The benefit of this operation over a non-fused operation are that //! it can compute slightly faster than doing the mul and add separately, and //! also the output can have higher accuracy in the result. //! * `insert`: The opposite of `extract`, this puts a new value into a //! particular lane of a SIMD type. //! * `load`: Reads an address and makes a SIMD register value. The details can //! vary because there's more than one type of `load`, but generally this is a //! `&T -> U` style operation. //! * `max`: Picks the larger value from each of the two inputs. //! 
* `min`: Picks the smaller value from each of the two inputs. //! * `mul`: Multiplication. For floating point this is just "normal" //! multiplication, but for integer types you tend to have some options. An //! integer multiplication of X bits will produce a 2X bit output, so //! generally you'll get to pick if you want to keep the high half of that, //! the low half of that (a normal "wrapping" mul), or "widen" the outputs to //! be all the bits at the expense of not multiplying half the lanes the //! lanes. //! * `pack`: Take the integers in the `a` and `b` inputs, reduce them to fit //! within the half-sized integer type (eg: `i16` to `i8`), and pack them all //! together into the output. //! * `population`: The "population" operations refer to the bits within an //! integer. Either counting them or adjusting them in various ways. //! * `rdrand`: Use the hardware RNG to make a random value of the given length. //! * `rdseed`: Use the hardware RNG to make a random seed of the given length. //! This is less commonly available, but theoretically an improvement over //! `rdrand` in that if you have to combine more than one usage of this //! operation to make your full seed size then the guess difficulty rises at a //! multiplicative rate instead of just an additive rate. For example, two //! `u64` outputs concatenated to a single `u128` have a guess difficulty of //! 2^(64*64) with `rdseed` but only 2^(64+64) with `rdrand`. //! * `read_timestamp_counter`: Lets you read the CPU's cycle counter, which //! doesn't strictly mean anything in particular since even the CPU's clock //! rate isn't even stable over time, but you might find it interesting as an //! approximation during benchmarks, or something like that. //! * `reciprocal`: Turns `x` into `1/x`. Can also be combined with a `sqrt` //! operation. //! * `round`: Convert floating point values to whole numbers, according to one //! of several available methods. //! * `set`: Places a list of scalar values into a SIMD lane. Conceptually //! similar to how building an array works in Rust. //! * `splat`: Not generally an operation of its own, but a modifier to other //! operations such as `load` and `set`. This will copy a given value across a //! SIMD type as many times as it can be copied. For example, a 32-bit value //! splatted into a 128-bit register will be copied four times. //! * `shl`: Bit shift left. New bits shifted in are always 0. Because the shift //! is the same for both signed and unsigned values, this crate simply marks //! left shift as always being an unsigned operation. //! * You can shift by an immediate value ("imm"), all lanes by the same value //! ("all"), or each lane by its own value ("each"). //! * `shr`: Bit shift right. This comes in two forms: "Arithmetic" shifts shift //! in the starting sign bit (which preserves the sign of the value), and //! "Logical" shifts shift in 0 regardless of the starting sign bit (so the //! result ends up being positive). With normal Rust types, signed integers //! use arithmetic shifts and unsigned integers use logical shifts, so these //! functions are marked as being for signed or unsigned integers //! appropriately. //! * As with `shl`, you can shift by an immediate value ("imm"), all lanes by //! the same value ("all"), or each lane by its own value ("each"). //! * `sign_apply`: Multiplies one set of values by the signum (1, 0, or -1) of //! another set of values. //! * `sqrt`: Square Root. //! * `store`: Writes a SIMD value to a memory location. //! 
* `string_search`: A rather specialized instruction that lets you do byte //! based searching within a register. This lets you do some very high speed //! searching through ASCII strings when the stars align. //! * `sub`: Subtract. //! * `shuffle`: This lets you re-order the data lanes. Sometimes x86/x64 calls //! this is called "shuffle", and sometimes it's called "permute", and there's //! no particular reasoning behind the different names, so we just call them //! all shuffle. //! * `shuffle_{args}_{lane-type}_{lane-sources}_{simd-type}`. //! * "args" is the input arguments: `a` (one arg) or `ab` (two args), then //! either `v` (runtime-varying) or `i` (immediate). All the immediate //! shuffles are macros, of course. //! * "lane type" is `f32`, `f64`, `i8`, etc. If there's a `z` after the type //! then you'll also be able to zero an output position instead of making it //! come from a particular source lane. //! * "lane sources" is generally either "all" which means that all lanes can //! go to all other lanes, or "half" which means that each half of the lanes //! is isolated from the other half, and you can't cross data between the //! two halves, only within a half (this is how most of the 256-bit x86/x64 //! shuffles work). //! * `unpack`: Takes a SIMD value and gets out some of the lanes while widening //! them, such as converting `i16` to `i32`. safe_arch-0.7.1/src/x86_x64/000077500000000000000000000000001445526200400153515ustar00rootroot00000000000000safe_arch-0.7.1/src/x86_x64/adx.rs000066400000000000000000000015721445526200400165000ustar00rootroot00000000000000#![cfg(target_feature = "adx")] use super::*; /// Add two `u32` with a carry value. /// /// Writes the sum to the reference, and returns the new carry flag. /// /// * **Intrinsic:** [`_addcarryx_u32`] /// * **Assembly:** /// `adcx r32, r32` /// `adox r32, r32` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "adx")))] pub fn add_carry_u32(c_in: u8, a: u32, b: u32, out: &mut u32) -> u8 { unsafe { _addcarryx_u32(c_in, a, b, out) } } /// Add two `u64` with a carry value. /// /// Writes the sum to the reference and returns the new carry flag. /// /// * **Intrinsic:** [`_addcarryx_u64`] /// * **Assembly:** /// `adcx r64, r64` /// `adox r64, r64` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "adx")))] #[cfg(target_arch = "x86_64")] pub fn add_carry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { unsafe { _addcarryx_u64(c_in, a, b, out) } } safe_arch-0.7.1/src/x86_x64/aes.rs000066400000000000000000000046021445526200400164710ustar00rootroot00000000000000#![cfg(target_feature = "aes")] use super::*; /// Perform one round of an AES decryption flow on `a` using the `round_key`. /// /// * **Intrinsic:** [`_mm_aesdec_si128`] /// * **Assembly:** `aesdec xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "aes")))] pub fn aes_decrypt_m128i(a: m128i, round_key: m128i) -> m128i { m128i(unsafe { _mm_aesdec_si128(a.0, round_key.0) }) } /// Perform the last round of an AES decryption flow on `a` using the /// `round_key`. /// /// * **Intrinsic:** [`_mm_aesdeclast_si128`] /// * **Assembly:** `aesdeclast xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "aes")))] pub fn aes_decrypt_last_m128i(a: m128i, round_key: m128i) -> m128i { m128i(unsafe { _mm_aesdeclast_si128(a.0, round_key.0) }) } /// Perform one round of an AES encryption flow on `a` using the `round_key`. 
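///
/// A full AES-128 block encryption chains nine of these rounds over an
/// expanded key schedule and then finishes with [`aes_encrypt_last_m128i`].
/// A minimal sketch (the `round_keys` schedule is assumed to have been
/// expanded elsewhere and is just zeroed here; the initial whitening step
/// uses the `^` operator overload on `m128i`):
///
/// ```
/// # use safe_arch::*;
/// # let round_keys = [m128i::default(); 11];
/// # let block = m128i::default();
/// // AES-128: initial AddRoundKey, nine full rounds, one final round.
/// let mut state = block ^ round_keys[0];
/// for rk in &round_keys[1..10] {
///   state = aes_encrypt_m128i(state, *rk);
/// }
/// let _ciphertext = aes_encrypt_last_m128i(state, round_keys[10]);
/// ```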
/// /// * **Intrinsic:** [`_mm_aesenc_si128`] /// * **Assembly:** `aesenc xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "aes")))] pub fn aes_encrypt_m128i(a: m128i, round_key: m128i) -> m128i { m128i(unsafe { _mm_aesenc_si128(a.0, round_key.0) }) } /// Perform the last round of an AES encryption flow on `a` using the /// `round_key`. /// /// * **Intrinsic:** [`_mm_aesenclast_si128`] /// * **Assembly:** `aesenclast xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "aes")))] pub fn aes_encrypt_last_m128i(a: m128i, round_key: m128i) -> m128i { m128i(unsafe { _mm_aesenclast_si128(a.0, round_key.0) }) } /// Perform the InvMixColumns transform on `a`. /// /// * **Intrinsic:** [`_mm_aesimc_si128`] /// * **Assembly:** `aesimc xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "aes")))] pub fn aes_inv_mix_columns_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_aesimc_si128(a.0) }) } /// Assist in expanding an AES cipher key. /// /// This computes steps towards generating a round key for an encryption cipher /// using data from `a` and an 8-bit round constant specified by the `IMM` /// constant used. /// /// * **Intrinsic:** [`_mm_aeskeygenassist_si128`] /// * **Assembly:** `aeskeygenassist xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "aes")))] pub fn aes_key_gen_assist_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_aeskeygenassist_si128(a.0, IMM) }) } safe_arch-0.7.1/src/x86_x64/avx.rs000066400000000000000000001710171445526200400165240ustar00rootroot00000000000000#![cfg(target_feature = "avx")] use super::*; /// Lanewise `a + b` with `f64` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn add_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_add_pd(a.0, b.0) }) } /// Lanewise `a + b` with `f32` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn add_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_add_ps(a.0, b.0) }) } /// Alternately, from the top, add `f64` then sub `f64`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn addsub_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_addsub_pd(a.0, b.0) }) } /// Alternately, from the top, add `f32` then sub `f32`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn addsub_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_addsub_ps(a.0, b.0) }) } /// Bitwise `a & b`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn bitand_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_and_pd(a.0, b.0) }) } /// Bitwise `a & b`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn bitand_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_and_ps(a.0, b.0) }) } /// Bitwise `(!a) & b`. 
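///
/// Since the active bits of `a` are the bits that get cleared from `b`, one
/// common use is stripping sign bits to take absolute values. A small sketch
/// (assuming the `[f64; 4]` array conversions provided by the wrapper type):
///
/// ```
/// # use safe_arch::*;
/// let sign_bits = m256d::from([-0.0; 4]);
/// let x = m256d::from([-1.5, 2.0, -0.25, 3.0]);
/// let abs_x = bitandnot_m256d(sign_bits, x);
/// assert_eq!(<[f64; 4]>::from(abs_x), [1.5, 2.0, 0.25, 3.0]);
/// ```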
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn bitandnot_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_andnot_pd(a.0, b.0) }) } /// Bitwise `(!a) & b`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn bitandnot_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_andnot_ps(a.0, b.0) }) } /// Blends the `f64` lanes according to the immediate mask. /// /// Each bit 0 though 3 controls output lane 0 through 3. Use 0 for the `a` /// value and 1 for the `b` value. /// /// * **Intrinsic:** [``] /// * **Assembly:** pub fn blend_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_blend_pd(a.0, b.0, IMM) }) } /// Blends the `f32` lanes according to the immediate mask. /// /// Each bit 0 though 7 controls lane 0 through 7. Use 0 for the `$a` value and /// 1 for the `$b` value. /// /// * **Intrinsic:** [``] /// * **Assembly:** pub fn blend_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_blend_ps(a.0, b.0, IMM) }) } /// Blend the lanes according to a runtime varying mask. /// /// The sign bit of each lane in the `mask` value determines if the output /// lane uses `a` (mask non-negative) or `b` (mask negative). /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn blend_varying_m256d(a: m256d, b: m256d, mask: m256d) -> m256d { m256d(unsafe { _mm256_blendv_pd(a.0, b.0, mask.0) }) } /// Blend the lanes according to a runtime varying mask. /// /// The sign bit of each lane in the `mask` value determines if the output /// lane uses `a` (mask non-negative) or `b` (mask negative). /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn blend_varying_m256(a: m256, b: m256, mask: m256) -> m256 { m256(unsafe { _mm256_blendv_ps(a.0, b.0, mask.0) }) } /// Load an `m128d` and splat it to the lower and upper half of an `m256d` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_m128d_splat_m256d(a: &m128d) -> m256d { m256d(unsafe { _mm256_broadcast_pd(&a.0) }) } /// Load an `m128` and splat it to the lower and upper half of an `m256` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_m128_splat_m256(a: &m128) -> m256 { m256(unsafe { _mm256_broadcast_ps(&a.0) }) } /// Load an `f64` and splat it to all lanes of an `m256d` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_f64_splat_m256d(a: &f64) -> m256d { m256d(unsafe { _mm256_broadcast_sd(a) }) } /// Load an `f32` and splat it to all lanes of an `m256d` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_f32_splat_m256(a: &f32) -> m256 { m256(unsafe { _mm256_broadcast_ss(a) }) } /// Bit-preserving cast to `m256` from `m256d`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cast_to_m256_from_m256d(a: m256d) -> m256 { m256(unsafe { _mm256_castpd_ps(a.0) }) } /// Bit-preserving cast to `m256i` from `m256d`. 
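///
/// A cast keeps the exact bit pattern of each lane, unlike the `convert`
/// operations which re-encode values. A small sketch (assuming the `[u64; 4]`
/// and `[f64; 4]` array conversions provided by the wrapper types):
///
/// ```
/// # use safe_arch::*;
/// let d = m256d::from([1.0, 0.0, 0.0, 0.0]);
/// let i = cast_to_m256i_from_m256d(d);
/// assert_eq!(<[u64; 4]>::from(i)[0], 1.0_f64.to_bits());
/// ```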
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cast_to_m256i_from_m256d(a: m256d) -> m256i { m256i(unsafe { _mm256_castpd_si256(a.0) }) } /// Bit-preserving cast to `m256i` from `m256`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cast_to_m256d_from_m256(a: m256) -> m256d { m256d(unsafe { _mm256_castps_pd(a.0) }) } /// Bit-preserving cast to `m256i` from `m256`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cast_to_m256i_from_m256(a: m256) -> m256i { m256i(unsafe { _mm256_castps_si256(a.0) }) } /// Bit-preserving cast to `m256d` from `m256i`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cast_to_m256d_from_m256i(a: m256i) -> m256d { m256d(unsafe { _mm256_castsi256_pd(a.0) }) } /// Bit-preserving cast to `m256` from `m256i`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cast_to_m256_from_m256i(a: m256i) -> m256 { m256(unsafe { _mm256_castsi256_ps(a.0) }) } /// Bit-preserving cast to `m128` from `m256`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cast_to_m128_from_m256(a: m256) -> m128 { m128(unsafe { _mm256_castps256_ps128(a.0) }) } /// Bit-preserving cast to `m128d` from `m256d`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cast_to_m128d_from_m256d(a: m256d) -> m128d { m128d(unsafe { _mm256_castpd256_pd128(a.0) }) } /// Bit-preserving cast to `m128i` from `m256i`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cast_to_m128i_from_m256i(a: m256i) -> m128i { m128i(unsafe { _mm256_castsi256_si128(a.0) }) } /// Round `f64` lanes towards positive infinity. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn ceil_m256d(a: m256d) -> m256d { m256d(unsafe { _mm256_ceil_pd(a.0) }) } /// Round `f32` lanes towards positive infinity. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn ceil_m256(a: m256) -> m256 { m256(unsafe { _mm256_ceil_ps(a.0) }) } /// Turns a comparison operator token to the correct constant value. #[macro_export] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] macro_rules! 
cmp_op { (EqualOrdered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_EQ_OQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_EQ_OQ; _CMP_EQ_OQ }}; (EqualUnordered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_EQ_UQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_EQ_UQ; _CMP_EQ_UQ }}; (False) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_FALSE_OQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_FALSE_OQ; _CMP_FALSE_OQ }}; (GreaterEqualOrdered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_GE_OQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_GE_OQ; _CMP_GE_OQ }}; (GreaterThanOrdered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_GT_OQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_GT_OQ; _CMP_GT_OQ }}; (LessEqualOrdered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_LE_OQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_LE_OQ; _CMP_LE_OQ }}; (LessThanOrdered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_LT_OQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_LT_OQ; _CMP_LT_OQ }}; (NotEqualOrdered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_NEQ_OQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_NEQ_OQ; _CMP_NEQ_OQ }}; (NotEqualUnordered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_NEQ_UQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_NEQ_UQ; _CMP_NEQ_UQ }}; (NotGreaterEqualUnordered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_NGE_UQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_NGE_UQ; _CMP_NGE_UQ }}; (NotGreaterThanUnordered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_NGT_UQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_NGT_UQ; _CMP_NGT_UQ }}; (NotLessEqualUnordered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_NLE_UQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_NLE_UQ; _CMP_NLE_UQ }}; (NotLessThanUnordered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_NLT_UQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_NLT_UQ; _CMP_NLT_UQ }}; (Ordered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_ORD_Q; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_ORD_Q; _CMP_ORD_Q }}; (True) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_TRUE_UQ; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_TRUE_UQ; _CMP_TRUE_UQ }}; (Unordered) => {{ #[cfg(target_arch = "x86")] use ::core::arch::x86::_CMP_UNORD_Q; #[cfg(target_arch = "x86_64")] use ::core::arch::x86_64::_CMP_UNORD_Q; _CMP_UNORD_Q }}; ($unknown_op:tt) => {{ compile_error!("The operation name given is invalid."); }}; } /// Compare `f32` lanes according to the operation specified, mask output. /// /// * Operators are according to the [`cmp_op`] macro. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cmp_op_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmp_ps(a.0, b.0, OP) }) } /// Compare `f32` lanes according to the operation specified, mask output. /// /// * Operators are according to the [`cmp_op`] macro. 
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cmp_op_mask_m128_s<const OP: i32>(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmp_ss(a.0, b.0, OP) }) } /// Compare `f32` lanes according to the operation specified, mask output. /// /// * Operators are according to the [`cmp_op`] macro. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cmp_op_mask_m256<const OP: i32>(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_cmp_ps(a.0, b.0, OP) }) } /// Compare `f64` lanes according to the operation specified, mask output. /// /// * Operators are according to the [`cmp_op`] macro. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cmp_op_mask_m128d<const OP: i32>(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmp_pd(a.0, b.0, OP) }) } /// Compare `f64` lanes according to the operation specified, mask output. /// /// * Operators are according to the [`cmp_op`] macro. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cmp_op_mask_m128d_s<const OP: i32>(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmp_sd(a.0, b.0, OP) }) } /// Compare `f64` lanes according to the operation specified, mask output. /// /// * Operators are according to the [`cmp_op`] macro. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn cmp_op_mask_m256d<const OP: i32>(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_cmp_pd(a.0, b.0, OP) }) } /// Convert `i32` lanes to be `f64` lanes. /// /// * **Intrinsic:** [`_mm256_cvtepi32_pd`] /// * **Assembly:** `vcvtdq2pd ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_to_m256d_from_i32_m128i(a: m128i) -> m256d { m256d(unsafe { _mm256_cvtepi32_pd(a.0) }) } /// Convert `i32` lanes to be `f32` lanes. /// /// * **Intrinsic:** [`_mm256_cvtepi32_ps`] /// * **Assembly:** `vcvtdq2ps ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_to_m256_from_i32_m256i(a: m256i) -> m256 { m256(unsafe { _mm256_cvtepi32_ps(a.0) }) } /// Convert `f64` lanes to be `i32` lanes. /// /// * **Intrinsic:** [`_mm256_cvtpd_epi32`] /// * **Assembly:** `vcvtpd2dq xmm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_to_i32_m128i_from_m256d(a: m256d) -> m128i { m128i(unsafe { _mm256_cvtpd_epi32(a.0) }) } /// Convert `f64` lanes to be `f32` lanes. /// /// * **Intrinsic:** [`_mm256_cvtpd_ps`] /// * **Assembly:** `vcvtpd2ps xmm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_to_m128_from_m256d(a: m256d) -> m128 { m128(unsafe { _mm256_cvtpd_ps(a.0) }) } /// Convert `f32` lanes to be `i32` lanes. /// /// * **Intrinsic:** [`_mm256_cvtps_epi32`] /// * **Assembly:** `vcvtps2dq ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_to_i32_m256i_from_m256(a: m256) -> m256i { m256i(unsafe { _mm256_cvtps_epi32(a.0) }) } /// Convert `f32` lanes to be `f64` lanes.
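// Illustrative sketch (not copied from the upstream docs): the comparison
// functions above take the operator as a const generic, which is usually
// filled in with the `cmp_op!` macro, and the resulting lane mask can be
// packed to bits with `move_mask_m256` (defined further down in this file).
//
// ```
// # use safe_arch::*;
// let a = m256::from([1.0, 5.0, 3.0, 7.0, 1.0, 5.0, 3.0, 7.0]);
// let b = m256::from([4.0; 8]);
// let mask = cmp_op_mask_m256::<{ cmp_op!(LessThanOrdered) }>(a, b);
// assert_eq!(move_mask_m256(mask), 0b0101_0101);
// ```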
/// /// * **Intrinsic:** [`_mm256_cvtps_pd`] /// * **Assembly:** `vcvtps2pd ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_to_m256d_from_m128(a: m128) -> m256d { m256d(unsafe { _mm256_cvtps_pd(a.0) }) } /// Convert the lowest `f64` lane to a single `f64`. /// /// * **Intrinsic:** [`_mm256_cvtsd_f64`] /// * **Assembly:** `vmovsd m64, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_to_f64_from_m256d_s(a: m256d) -> f64 { unsafe { _mm256_cvtsd_f64(a.0) } } /// Convert the lowest `i32` lane to a single `i32`. /// /// * **Intrinsic:** [`_mm256_cvtsi256_si32`] /// * **Assembly:** `vmovd r32, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_to_i32_from_m256i_s(a: m256i) -> i32 { unsafe { _mm256_cvtsi256_si32(a.0) } } /// Convert the lowest `f32` lane to a single `f32`. /// /// * **Intrinsic:** [`_mm256_cvtss_f32`] /// * **Assembly:** `vmovss m32, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_to_f32_from_m256_s(a: m256) -> f32 { unsafe { _mm256_cvtss_f32(a.0) } } /// Convert `f64` lanes to `i32` lanes with truncation. /// /// * **Intrinsic:** [`_mm256_cvttpd_epi32`] /// * **Assembly:** `vcvttpd2dq xmm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_truncate_to_i32_m128i_from_m256d(a: m256d) -> m128i { m128i(unsafe { _mm256_cvttpd_epi32(a.0) }) } /// Convert `f32` lanes to `i32` lanes with truncation. /// /// * **Intrinsic:** [`_mm256_cvttps_epi32`] /// * **Assembly:** `` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn convert_truncate_to_i32_m256i_from_m256(a: m256) -> m256i { m256i(unsafe { _mm256_cvttps_epi32(a.0) }) } /// Lanewise `a / b` with `f64`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn div_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_div_pd(a.0, b.0) }) } /// Lanewise `a / b` with `f32`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn div_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_div_ps(a.0, b.0) }) } /// This works like [`dot_product_m128`], but twice as wide. /// /// The given control is used for the lower 4 lanes and then separately also the /// upper four lanes. See the other macro for more info on how the control /// works. 
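// Illustrative sketch of the two `f64`-to-`i32` conversions above (assumes the
// default MXCSR rounding mode, round-to-nearest-even, and the crate's array
// conversions): the plain conversion rounds, the `convert_truncate_*` variant
// simply drops the fractional part.
//
// ```
// # use safe_arch::*;
// let a = m256d::from([1.7, 2.5, -1.7, 3.1]);
// let rounded: [i32; 4] = convert_to_i32_m128i_from_m256d(a).into();
// let truncated: [i32; 4] = convert_truncate_to_i32_m128i_from_m256d(a).into();
// assert_eq!(rounded, [2, 2, -2, 3]);
// assert_eq!(truncated, [1, 2, -1, 3]);
// ```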
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn dot_product_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_dp_ps(a.0, b.0, IMM) }) } /// Extracts an `i32` lane from `m256i` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn extract_i32_from_m256i(a: m256i) -> i32 { unsafe { _mm256_extract_epi32(a.0, IMM) } } /// Extracts an `i64` lane from `m256i` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[cfg(target_arch = "x86_64")] pub fn extract_i64_from_m256i(a: m256i) -> i64 { unsafe { _mm256_extract_epi64(a.0, IMM) } } /// Extracts an `m128d` from `m256d` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn extract_m128d_from_m256d(a: m256d) -> m128d { m128d(unsafe { _mm256_extractf128_pd(a.0, IMM) }) } /// Extracts an `m128` from `m256` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn extract_m128_from_m256(a: m256) -> m128 { m128(unsafe { _mm256_extractf128_ps(a.0, IMM) }) } /// Extracts an `m128i` from `m256i` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn extract_m128i_from_m256i(a: m256i) -> m128i { m128i(unsafe { _mm256_extractf128_si256(a.0, IMM) }) } /// Round `f64` lanes towards negative infinity. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn floor_m256d(a: m256d) -> m256d { m256d(unsafe { _mm256_floor_pd(a.0) }) } /// Round `f32` lanes towards negative infinity. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn floor_m256(a: m256) -> m256 { m256(unsafe { _mm256_floor_ps(a.0) }) } /// Add adjacent `f64` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn add_horizontal_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_hadd_pd(a.0, b.0) }) } /// Add adjacent `f32` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn add_horizontal_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_hadd_ps(a.0, b.0) }) } /// Subtract adjacent `f64` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn sub_horizontal_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_hsub_pd(a.0, b.0) }) } /// Subtract adjacent `f32` lanes. 
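// Illustrative sketch of the horizontal add above (spelled out from the
// intrinsic's definition, not upstream documentation): adjacent pairs are
// summed within each 128-bit half, interleaving results from `a` and `b`.
//
// ```
// # use safe_arch::*;
// let a = m256d::from([1.0, 2.0, 3.0, 4.0]);
// let b = m256d::from([10.0, 20.0, 30.0, 40.0]);
// let c = add_horizontal_m256d(a, b).to_array();
// assert_eq!(c, [1.0 + 2.0, 10.0 + 20.0, 3.0 + 4.0, 30.0 + 40.0]);
// ```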
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn sub_horizontal_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_hsub_ps(a.0, b.0) }) } /// Inserts an `i8` to `m256i` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn insert_i8_to_m256i(a: m256i, i: i8) -> m256i { m256i(unsafe { _mm256_insert_epi8(a.0, i, IMM) }) } /// Inserts an `i16` to `m256i` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn insert_i16_to_m256i(a: m256i, i: i16) -> m256i { m256i(unsafe { _mm256_insert_epi16(a.0, i, IMM) }) } /// Inserts an `i32` to `m256i` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn insert_i32_to_m256i(a: m256i, i: i32) -> m256i { m256i(unsafe { _mm256_insert_epi32(a.0, i, IMM) }) } /// Inserts an `i64` to `m256i` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[cfg(target_arch = "x86_64")] pub fn insert_i64_to_m256i(a: m256i, i: i64) -> m256i { m256i(unsafe { _mm256_insert_epi64(a.0, i, IMM) }) } /// Inserts an `m128d` to `m256d` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn insert_m128d_to_m256d(a: m256d, b: m128d) -> m256d { m256d(unsafe { _mm256_insertf128_pd(a.0, b.0, IMM) }) } /// Inserts an `m128` to `m256` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn insert_m128_to_m256(a: m256, b: m128) -> m256 { m256(unsafe { _mm256_insertf128_ps(a.0, b.0, IMM) }) } /// Slowly inserts an `m128i` to `m256i`. /// /// This is a "historical artifact" that was potentially useful if you have AVX /// but not AVX2. If you plan on having AVX2 available please use /// [`insert_m128i_to_m256i`], it will do the same task with better performance. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn insert_m128i_to_m256i_slow_avx(a: m256i, b: m128i) -> m256i { m256i(unsafe { _mm256_insertf128_si256(a.0, b.0, IMM) }) } /// Load data from memory into a register. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_m256d(a: &m256d) -> m256d { m256d(unsafe { _mm256_load_pd(a as *const m256d as *const f64) }) } /// Load data from memory into a register. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_m256(a: &m256) -> m256 { m256(unsafe { _mm256_load_ps(a as *const m256 as *const f32) }) } /// Load data from memory into a register. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_m256i(a: &m256i) -> m256i { m256i(unsafe { _mm256_load_si256(a as *const m256i as *const __m256i) }) } /// Load data from memory into a register. 
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_unaligned_m256d(a: &[f64; 4]) -> m256d { m256d(unsafe { _mm256_loadu_pd(a as *const [f64; 4] as *const f64) }) } /// Load data from memory into a register. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_unaligned_m256(a: &[f32; 8]) -> m256 { m256(unsafe { _mm256_loadu_ps(a as *const [f32; 8] as *const f32) }) } /// Load data from memory into a register. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_unaligned_m256i(a: &[i8; 32]) -> m256i { m256i(unsafe { _mm256_loadu_si256(a as *const [i8; 32] as *const __m256i) }) } /// Load data from memory into a register. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_unaligned_hi_lo_m256d(a: &[f64; 2], b: &[f64; 2]) -> m256d { m256d(unsafe { _mm256_loadu2_m128d(a as *const [f64; 2] as *const f64, b as *const [f64; 2] as *const f64) }) } /// Load data from memory into a register. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_unaligned_hi_lo_m256(a: &[f32; 4], b: &[f32; 4]) -> m256 { m256(unsafe { _mm256_loadu2_m128(a as *const [f32; 4] as *const f32, b as *const [f32; 4] as *const f32) }) } /// Load data from memory into a register. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_unaligned_hi_lo_m256i(a: &[i8; 16], b: &[i8; 16]) -> m256i { m256i(unsafe { _mm256_loadu2_m128i(a as *const [i8; 16] as *const __m128i, b as *const [i8; 16] as *const __m128i) }) } /// Load data from memory into a register according to a mask. /// /// When the high bit of a mask lane isn't set the loaded lane will be zero. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_masked_m128d(a: &m128d, mask: m128i) -> m128d { m128d(unsafe { _mm_maskload_pd(a as *const m128d as *const f64, mask.0) }) } /// Load data from memory into a register according to a mask. /// /// When the high bit of a mask lane isn't set the loaded lane will be zero. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_masked_m256d(a: &m256d, mask: m256i) -> m256d { m256d(unsafe { _mm256_maskload_pd(a as *const m256d as *const f64, mask.0) }) } /// Load data from memory into a register according to a mask. /// /// When the high bit of a mask lane isn't set the loaded lane will be zero. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_masked_m128(a: &m128, mask: m128i) -> m128 { m128(unsafe { _mm_maskload_ps(a as *const m128 as *const f32, mask.0) }) } /// Load data from memory into a register according to a mask. /// /// When the high bit of a mask lane isn't set the loaded lane will be zero. 
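// Illustrative sketch: the hi/lo loads above take the *high* half first. This
// ordering is an assumption read off the wrapped intrinsic call, not a quote
// from the upstream docs.
//
// ```
// # use safe_arch::*;
// let hi = [3.0_f64, 4.0];
// let lo = [1.0_f64, 2.0];
// let v = load_unaligned_hi_lo_m256d(&hi, &lo).to_array();
// assert_eq!(v, [1.0, 2.0, 3.0, 4.0]);
// ```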
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn load_masked_m256(a: &m256, mask: m256i) -> m256 { m256(unsafe { _mm256_maskload_ps(a as *const m256 as *const f32, mask.0) }) } /// Store data from a register into memory according to a mask. /// /// When the high bit of a mask lane isn't set that lane is not written. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_masked_m128d(addr: &mut m128d, mask: m128i, a: m128d) { unsafe { _mm_maskstore_pd(addr as *mut m128d as *mut f64, mask.0, a.0) } } /// Store data from a register into memory according to a mask. /// /// When the high bit of a mask lane isn't set that lane is not written. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_masked_m256d(addr: &mut m256d, mask: m256i, a: m256d) { unsafe { _mm256_maskstore_pd(addr as *mut m256d as *mut f64, mask.0, a.0) } } /// Store data from a register into memory according to a mask. /// /// When the high bit of a mask lane isn't set that lane is not written. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_masked_m128(addr: &mut m128, mask: m128i, a: m128) { unsafe { _mm_maskstore_ps(addr as *mut m128 as *mut f32, mask.0, a.0) } } /// Store data from a register into memory according to a mask. /// /// When the high bit of a mask lane isn't set that lane is not written. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_masked_m256(addr: &mut m256, mask: m256i, a: m256) { unsafe { _mm256_maskstore_ps(addr as *mut m256 as *mut f32, mask.0, a.0) } } /// Lanewise `max(a, b)`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn max_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_max_pd(a.0, b.0) }) } /// Lanewise `max(a, b)`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn max_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_max_ps(a.0, b.0) }) } /// Lanewise `min(a, b)`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn min_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_min_pd(a.0, b.0) }) } /// Lanewise `min(a, b)`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn min_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_min_ps(a.0, b.0) }) } /// Duplicate the odd-indexed lanes to the even lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn duplicate_odd_lanes_m256d(a: m256d) -> m256d { m256d(unsafe { _mm256_movedup_pd(a.0) }) } /// Duplicate the even-indexed lanes to the odd lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn duplicate_even_lanes_m256(a: m256) -> m256 { m256(unsafe { _mm256_movehdup_ps(a.0) }) } /// Duplicate the odd-indexed lanes to the even lanes. 
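// Illustrative sketch of the masked loads above: only lanes whose mask value
// has the high bit set are loaded, the rest come back as zero. Negative
// integers have the high bit set, so `-1` is a convenient "load this lane"
// marker.
//
// ```
// # use safe_arch::*;
// let src = m256d::from([1.0, 2.0, 3.0, 4.0]);
// let mask = m256i::from([-1_i64, 0, -1, 0]);
// let loaded = load_masked_m256d(&src, mask).to_array();
// assert_eq!(loaded, [1.0, 0.0, 3.0, 0.0]);
// ```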
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn duplicate_odd_lanes_m256(a: m256) -> m256 { m256(unsafe { _mm256_moveldup_ps(a.0) }) } /// Collects the sign bit of each lane into a 4-bit value. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn move_mask_m256d(a: m256d) -> i32 { unsafe { _mm256_movemask_pd(a.0) } } /// Collects the sign bit of each lane into an 8-bit value. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn move_mask_m256(a: m256) -> i32 { unsafe { _mm256_movemask_ps(a.0) } } /// Lanewise `a * b` with `f64` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn mul_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_mul_pd(a.0, b.0) }) } /// Lanewise `a * b` with `f32` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn mul_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_mul_ps(a.0, b.0) }) } /// Bitwise `a | b`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn bitor_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_or_pd(a.0, b.0) }) } /// Bitwise `a | b`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn bitor_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_or_ps(a.0, b.0) }) } /// Shuffle the `f64` lanes in `a` using an immediate control value. /// /// * **Intrinsic:** [`_mm_permute_pd`] /// * **Assembly:** `vpermilpd xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn permute_m128d<const MASK: i32>(a: m128d) -> m128d { m128d(unsafe { _mm_permute_pd(a.0, MASK) }) } /// Shuffle the `f64` lanes in `a` using an immediate control value. /// /// * **Intrinsic:** [`_mm256_permute_pd`] /// * **Assembly:** `vpermilpd ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn permute_m256d<const MASK: i32>(a: m256d) -> m256d { m256d(unsafe { _mm256_permute_pd(a.0, MASK) }) } /// Shuffle the `f32` lanes from `a` using an immediate control value. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn permute_m128<const MASK: i32>(a: m128) -> m128 { m128(unsafe { _mm_permute_ps(a.0, MASK) }) } /// Shuffle the `f32` lanes in `a` using an immediate control value. /// /// * **Intrinsic:** [`_mm256_permute_ps`] /// * **Assembly:** `vpermilps ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn permute_m256<const MASK: i32>(a: m256) -> m256 { m256(unsafe { _mm256_permute_ps(a.0, MASK) }) } /// Shuffle 128 bits of floating point data at a time from `a` and `b` using an /// immediate control value. /// /// Each output selection is 4-bit wide, if `1000` is passed, that output is /// zeroed instead of picking from `a` or `b`.
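// Illustrative sketch: `move_mask_m256d` packs the four `f64` sign bits into
// the low bits of the returned `i32`, lane 0 landing in bit 0.
//
// ```
// # use safe_arch::*;
// let a = m256d::from([-1.0, 2.0, -3.0, 4.0]);
// assert_eq!(move_mask_m256d(a), 0b0101);
// ```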
/// /// * **Intrinsic:** [`_mm256_permute2f128_pd`] /// * **Assembly:** `vperm2f128 ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn permute2z_m256d<const MASK: i32>(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_permute2f128_pd(a.0, b.0, MASK) }) } /// Shuffle 128 bits of floating point data at a time from `a` and `b` using /// an immediate control value. /// /// Each output selection is 4-bit wide, if `1000` is passed, that output is /// zeroed instead of picking from `a` or `b`. /// /// * **Intrinsic:** [`_mm256_permute2f128_ps`] /// * **Assembly:** `vperm2f128 ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn permute2z_m256<const MASK: i32>(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_permute2f128_ps(a.0, b.0, MASK) }) } /// *Slowly* swizzle 128 bits of integer data from `a` and `b` using an /// immediate control value. /// /// Each output selection is 4-bit wide, if `1000` is passed, that output is /// zeroed instead of picking from `a` or `b`. /// /// If `avx2` is available you should use [`shuffle_abi_i128z_all_m256i`] /// instead. Only use this if you're targeting `avx` but not `avx2`. /// /// * **Intrinsic:** [`_mm256_permute2f128_si256`] /// * **Assembly:** `vperm2f128 ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn permute2z_m256i<const MASK: i32>(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_permute2f128_si256(a.0, b.0, MASK) }) } /// Shuffle `f64` lanes in `a` using **bit 1** of the `i64` lanes in `v` /// /// * **Intrinsic:** [`_mm_permutevar_pd`] /// * **Assembly:** `vpermilpd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn shuffle_av_f64_all_m128d(a: m128d, v: m128i) -> m128d { m128d(unsafe { _mm_permutevar_pd(a.0, v.0) }) } /// Shuffle `f64` lanes in `a` using **bit 1** of the `i64` lanes in `v`. /// /// Each lane selection value picks only within that 128-bit half of the overall /// register. /// /// * **Intrinsic:** [`_mm256_permutevar_pd`] /// * **Assembly:** `vpermilpd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn shuffle_av_f64_half_m256d(a: m256d, b: m256i) -> m256d { m256d(unsafe { _mm256_permutevar_pd(a.0, b.0) }) } /// Shuffle `f32` values in `a` using `i32` values in `v`. /// /// * **Intrinsic:** [`_mm_permutevar_ps`] /// * **Assembly:** `vpermilps xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn shuffle_av_f32_all_m128(a: m128, v: m128i) -> m128 { m128(unsafe { _mm_permutevar_ps(a.0, v.0) }) } /// Shuffle `f32` values in `a` using `i32` values in `v`. /// /// Each lane selection value picks only within that 128-bit half of the overall /// register. /// /// * **Intrinsic:** [`_mm256_permutevar_ps`] /// * **Assembly:** `vpermilps ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn shuffle_av_f32_half_m256(a: m256, v: m256i) -> m256 { m256(unsafe { _mm256_permutevar_ps(a.0, v.0) }) } /// Reciprocal of `f32` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn reciprocal_m256(a: m256) -> m256 { m256(unsafe { _mm256_rcp_ps(a.0) }) } /// Rounds each lane in the style specified.
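// Illustrative sketch of the 128-bit-at-a-time permutes above. The field
// encoding described here is an assumption taken from the intrinsic's
// definition: each 4-bit selector picks `0` = low(a), `1` = high(a),
// `2` = low(b), `3` = high(b), and setting bit 3 of a selector zeroes that
// output half instead.
//
// ```
// # use safe_arch::*;
// let a = m256::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
// let b = m256::from([9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]);
// // low output half <- high(a), high output half <- low(b)
// let c = permute2z_m256::<0b0010_0001>(a, b).to_array();
// assert_eq!(c, [5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]);
// ```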
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn round_m256d<const OP: i32>(a: m256d) -> m256d { m256d(unsafe { _mm256_round_pd(a.0, OP) }) } /// Rounds each lane in the style specified. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn round_m256<const OP: i32>(a: m256) -> m256 { m256(unsafe { _mm256_round_ps(a.0, OP) }) } /// Reciprocal of the square root of `f32` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn reciprocal_sqrt_m256(a: m256) -> m256 { m256(unsafe { _mm256_rsqrt_ps(a.0) }) } /// Set `i8` args into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_i8_m256i( e31: i8, e30: i8, e29: i8, e28: i8, e27: i8, e26: i8, e25: i8, e24: i8, e23: i8, e22: i8, e21: i8, e20: i8, e19: i8, e18: i8, e17: i8, e16: i8, e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8 ) -> m256i { m256i(unsafe { _mm256_set_epi8( e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0 ) }) } /// Set `i16` args into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_i16_m256i( e15: i16, e14: i16, e13: i16, e12: i16, e11: i16, e10: i16, e9: i16, e8: i16, e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, ) -> m256i { m256i(unsafe { _mm256_set_epi16( e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, ) }) } /// Set `i32` args into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_i32_m256i( e7: i32, e6: i32, e5: i32, e4: i32, e3: i32, e2: i32, e1: i32, e0: i32, ) -> m256i { m256i(unsafe { _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0) }) } /// Set `i64` args into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn set_i64_m256i(e3: i64, e2: i64, e1: i64, e0: i64) -> m256i { m256i(unsafe { _mm256_set_epi64x(e3, e2, e1, e0) }) } /// Set `m128` args into an `m256`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn set_m128_m256(high: m128, low: m128) -> m256 { m256(unsafe { _mm256_set_m128(high.0, low.0) }) } /// Set `m128d` args into an `m256d`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_m128d_m256d( high: m128d, low: m128d ) -> m256d { m256d(unsafe { _mm256_set_m128d(high.0, low.0) }) } /// Set `m128i` args into an `m256i`.
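// Illustrative sketch: like the underlying `_mm256_set_*` intrinsics, the
// `set_*` helpers above take their arguments highest lane first, so the last
// argument ends up in lane 0.
//
// ```
// # use safe_arch::*;
// let v: [i32; 8] = set_i32_m256i(7, 6, 5, 4, 3, 2, 1, 0).into();
// assert_eq!(v, [0, 1, 2, 3, 4, 5, 6, 7]);
// ```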
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_m128i_m256i( hi: m128i, lo: m128i ) -> m256i { m256i(unsafe { _mm256_set_m128i(hi.0, lo.0) }) } /// Set `f64` args into an `m256d` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_m256d( e3: f64, e2: f64, e1: f64, e0: f64, ) -> m256d { m256d(unsafe { _mm256_set_pd(e3, e2, e1, e0) }) } /// Set `f32` args into an `m256` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_m256( e7: f32, e6: f32, e5: f32, e4: f32, e3: f32, e2: f32, e1: f32, e0: f32, ) -> m256 { m256(unsafe { _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0) }) } /// Splat an `i8` arg into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn set_splat_i8_m256i(i: i8) -> m256i { m256i(unsafe { _mm256_set1_epi8(i) }) } /// Splat an `i16` arg into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn set_splat_i16_m256i(i: i16) -> m256i { m256i(unsafe { _mm256_set1_epi16(i) }) } /// Splat an `i32` arg into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn set_splat_i32_m256i(i: i32) -> m256i { m256i(unsafe { _mm256_set1_epi32(i) }) } /// Splat an `i64` arg into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn set_splat_i64_m256i(i: i64) -> m256i { m256i(unsafe { _mm256_set1_epi64x(i) }) } /// Splat an `f64` arg into an `m256d` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn set_splat_m256d(f: f64) -> m256d { m256d(unsafe { _mm256_set1_pd(f) }) } /// Splat an `f32` arg into an `m256` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_splat_m256( f: f32, ) -> m256 { m256(unsafe { _mm256_set1_ps(f) }) } /// Set `i8` args into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_reversed_i8_m256i( e31: i8, e30: i8, e29: i8, e28: i8, e27: i8, e26: i8, e25: i8, e24: i8, e23: i8, e22: i8, e21: i8, e20: i8, e19: i8, e18: i8, e17: i8, e16: i8, e15: i8, e14: i8, e13: i8, e12: i8, e11: i8, e10: i8, e9: i8, e8: i8, e7: i8, e6: i8, e5: i8, e4: i8, e3: i8, e2: i8, e1: i8, e0: i8 ) -> m256i { m256i(unsafe { _mm256_setr_epi8( e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0 ) }) } /// Set `i16` args into an `m256i` lane. 
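// Illustrative sketch: the two-register `set_m128*_m256*` helpers above take
// the *high* half first, so the second argument fills the low lanes of the
// result (an assumption read off the wrapped intrinsic calls).
//
// ```
// # use safe_arch::*;
// let hi = m128::from([5.0, 6.0, 7.0, 8.0]);
// let lo = m128::from([1.0, 2.0, 3.0, 4.0]);
// let v = set_m128_m256(hi, lo).to_array();
// assert_eq!(v, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
// ```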
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_reversed_i16_m256i( e15: i16, e14: i16, e13: i16, e12: i16, e11: i16, e10: i16, e9: i16, e8: i16, e7: i16, e6: i16, e5: i16, e4: i16, e3: i16, e2: i16, e1: i16, e0: i16, ) -> m256i { m256i(unsafe { _mm256_setr_epi16( e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0, ) }) } /// Set `i32` args into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_reversed_i32_m256i( e7: i32, e6: i32, e5: i32, e4: i32, e3: i32, e2: i32, e1: i32, e0: i32, ) -> m256i { m256i(unsafe { _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) }) } /// Set `i64` args into an `m256i` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn set_reversed_i64_m256i(e3: i64, e2: i64, e1: i64, e0: i64) -> m256i { m256i(unsafe { _mm256_setr_epi64x(e3, e2, e1, e0) }) } /// Set `m128` args into an `m256`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn set_reversed_m128_m256(hi: m128, lo: m128) -> m256 { m256(unsafe { _mm256_setr_m128(hi.0, lo.0) }) } /// Set `m128d` args into an `m256d`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_reversed_m128d_m256d( hi: m128d, lo: m128d ) -> m256d { m256d(unsafe { _mm256_setr_m128d(hi.0, lo.0) }) } /// Set `m128i` args into an `m256i`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_reversed_m128i_m256i( hi: m128i, lo: m128i ) -> m256i { m256i(unsafe { _mm256_setr_m128i(hi.0, lo.0) }) } /// Set `f64` args into an `m256d` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_reversed_m256d( e3: f64, e2: f64, e1: f64, e0: f64, ) -> m256d { m256d(unsafe { _mm256_setr_pd(e3, e2, e1, e0) }) } /// Set `f32` args into an `m256` lane. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] #[rustfmt::skip] pub fn set_reversed_m256( e7: f32, e6: f32, e5: f32, e4: f32, e3: f32, e2: f32, e1: f32, e0: f32, ) -> m256 { m256(unsafe { _mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0) }) } /// A zeroed `m256d` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn zeroed_m256d() -> m256d { m256d(unsafe { _mm256_setzero_pd() }) } /// A zeroed `m256` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn zeroed_m256() -> m256 { m256(unsafe { _mm256_setzero_ps() }) } /// A zeroed `m256i` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn zeroed_m256i() -> m256i { m256i(unsafe { _mm256_setzero_si256() }) } /// Shuffle the `f64` lanes from `a` and `b` together using an immediate control /// value. 
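// Illustrative sketch: the `set_reversed_*` functions above fill from the
// lowest lane first, so the same argument order produces the mirrored layout
// of the plain `set_*` functions.
//
// ```
// # use safe_arch::*;
// let a: [i32; 8] = set_i32_m256i(7, 6, 5, 4, 3, 2, 1, 0).into();
// let b: [i32; 8] = set_reversed_i32_m256i(7, 6, 5, 4, 3, 2, 1, 0).into();
// assert_eq!(a, [0, 1, 2, 3, 4, 5, 6, 7]);
// assert_eq!(b, [7, 6, 5, 4, 3, 2, 1, 0]);
// ```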
/// /// The control value uses the lowest 4 bits only. /// * bit 0 picks between lanes 0 or 1 from A. /// * bit 1 picks between lanes 0 or 1 from B. /// * bit 2 picks between lanes 2 or 3 from A. /// * bit 3 picks between lanes 2 or 3 from B. /// /// Note that this shuffle cannot move data between the lower half of the lanes /// and the upper half of the lanes. /// /// * **Intrinsic:** [`_mm256_shuffle_pd`] /// * **Assembly:** `vshufpd ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn shuffle_m256d<const IMM: i32>(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_shuffle_pd(a.0, b.0, IMM) }) } /// Shuffle the `f32` lanes from `a` and `b` together using an immediate /// control value. /// /// This works like [`shuffle_abi_f32_all_m128`], but with the low 128 bits and /// high 128 bits each doing a shuffle at the same time. Each index (`0..=3`) /// only refers to a lane within a given 128 bit portion of the 256 bit inputs. /// You cannot cross data between the two 128 bit halves. /// /// * **Intrinsic:** [`_mm256_shuffle_ps`] /// * **Assembly:** `vshufps ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn shuffle_m256<const IMM: i32>(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_shuffle_ps(a.0, b.0, IMM) }) } /// Lanewise `sqrt` on `f64` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn sqrt_m256d(a: m256d) -> m256d { m256d(unsafe { _mm256_sqrt_pd(a.0) }) } /// Lanewise `sqrt` on `f32` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn sqrt_m256(a: m256) -> m256 { m256(unsafe { _mm256_sqrt_ps(a.0) }) } /// Store data from a register into memory. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_m256d(addr: &mut m256d, a: m256d) { unsafe { _mm256_store_pd(addr as *mut m256d as *mut f64, a.0) } } /// Store data from a register into memory. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_m256(addr: &mut m256, a: m256) { unsafe { _mm256_store_ps(addr as *mut m256 as *mut f32, a.0) } } /// Store data from a register into memory. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_m256i(addr: &mut m256i, a: m256i) { unsafe { _mm256_store_si256(addr as *mut m256i as *mut __m256i, a.0) } } /// Store data from a register into memory. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_unaligned_m256d(addr: &mut [f64; 4], a: m256d) { unsafe { _mm256_storeu_pd(addr.as_mut_ptr(), a.0) } } /// Store data from a register into memory. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_unaligned_m256(addr: &mut [f32; 8], a: m256) { unsafe { _mm256_storeu_ps(addr.as_mut_ptr(), a.0) } } /// Store data from a register into memory.
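// Illustrative sketch for the `f64` shuffle above: bits 0 and 2 pick from `a`,
// bits 1 and 3 pick from `b`, and no selection crosses a 128-bit half.
//
// ```
// # use safe_arch::*;
// let a = m256d::from([1.0, 2.0, 3.0, 4.0]);
// let b = m256d::from([5.0, 6.0, 7.0, 8.0]);
// let c = shuffle_m256d::<0b0110>(a, b).to_array();
// assert_eq!(c, [1.0, 6.0, 4.0, 7.0]);
// ```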
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_unaligned_m256i(addr: &mut [i8; 32], a: m256i) { unsafe { _mm256_storeu_si256(addr as *mut [i8; 32] as *mut __m256i, a.0) } } /// Store data from a register into memory. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_unaligned_hi_lo_m256d(hi_addr: &mut [f64; 2], lo_addr: &mut [f64; 2], a: m256d) { unsafe { _mm256_storeu2_m128d(hi_addr.as_mut_ptr(), lo_addr.as_mut_ptr(), a.0) } } /// Store data from a register into memory. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_unaligned_hi_lo_m256(hi_addr: &mut [f32; 4], lo_addr: &mut [f32; 4], a: m256) { unsafe { _mm256_storeu2_m128(hi_addr.as_mut_ptr(), lo_addr.as_mut_ptr(), a.0) } } /// Store data from a register into memory. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn store_unaligned_hi_lo_m256i(hi_addr: &mut [i8; 16], lo_addr: &mut [i8; 16], a: m256i) { unsafe { _mm256_storeu2_m128i(hi_addr.as_mut_ptr().cast(), lo_addr.as_mut_ptr().cast(), a.0) } } /// Lanewise `a - b` with `f64` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn sub_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_sub_pd(a.0, b.0) }) } /// Lanewise `a - b` with `f32` lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn sub_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_sub_ps(a.0, b.0) }) } /// Unpack and interleave the high lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn unpack_hi_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_unpackhi_pd(a.0, b.0) }) } /// Unpack and interleave the high lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn unpack_hi_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_unpackhi_ps(a.0, b.0) }) } /// Unpack and interleave the low lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn unpack_lo_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_unpacklo_pd(a.0, b.0) }) } /// Unpack and interleave the low lanes. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn unpack_lo_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_unpacklo_ps(a.0, b.0) }) } /// Bitwise `a ^ b`. /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn bitxor_m256d(a: m256d, b: m256d) -> m256d { m256d(unsafe { _mm256_xor_pd(a.0, b.0) }) } /// Bitwise `a ^ b`.
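// Illustrative sketch of the unpack functions above: the low (or high) lanes
// of each 128-bit half of `a` and `b` are interleaved, `a` first.
//
// ```
// # use safe_arch::*;
// let a = m256d::from([1.0, 2.0, 3.0, 4.0]);
// let b = m256d::from([5.0, 6.0, 7.0, 8.0]);
// assert_eq!(unpack_lo_m256d(a, b).to_array(), [1.0, 5.0, 3.0, 7.0]);
// assert_eq!(unpack_hi_m256d(a, b).to_array(), [2.0, 6.0, 4.0, 8.0]);
// ```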
/// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn bitxor_m256(a: m256, b: m256) -> m256 { m256(unsafe { _mm256_xor_ps(a.0, b.0) }) } /// Zero extend an `m128d` to `m256d` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn zero_extend_m128d(a: m128d) -> m256d { m256d(unsafe { _mm256_zextpd128_pd256(a.0) }) } /// Zero extend an `m128` to `m256` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn zero_extend_m128(a: m128) -> m256 { m256(unsafe { _mm256_zextps128_ps256(a.0) }) } /// Zero extend an `m128i` to `m256i` /// /// * **Intrinsic:** [``] /// * **Assembly:** #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx")))] pub fn zero_extend_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_zextsi128_si256(a.0) }) } impl Add for m256d { type Output = Self; #[must_use] #[inline(always)] fn add(self, rhs: Self) -> Self { add_m256d(self, rhs) } } impl AddAssign for m256d { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = *self + rhs; } } impl BitAnd for m256d { type Output = Self; #[must_use] #[inline(always)] fn bitand(self, rhs: Self) -> Self { bitand_m256d(self, rhs) } } impl BitAndAssign for m256d { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = *self & rhs; } } impl BitOr for m256d { type Output = Self; #[must_use] #[inline(always)] fn bitor(self, rhs: Self) -> Self { bitor_m256d(self, rhs) } } impl BitOrAssign for m256d { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = *self | rhs; } } impl BitXor for m256d { type Output = Self; #[must_use] #[inline(always)] fn bitxor(self, rhs: Self) -> Self { bitxor_m256d(self, rhs) } } impl BitXorAssign for m256d { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = *self ^ rhs; } } impl Div for m256d { type Output = Self; #[must_use] #[inline(always)] fn div(self, rhs: Self) -> Self { div_m256d(self, rhs) } } impl DivAssign for m256d { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = *self / rhs; } } impl Mul for m256d { type Output = Self; #[must_use] #[inline(always)] fn mul(self, rhs: Self) -> Self { mul_m256d(self, rhs) } } impl MulAssign for m256d { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = *self * rhs; } } impl Neg for m256d { type Output = Self; #[must_use] #[inline(always)] fn neg(self) -> Self { sub_m256d(zeroed_m256d(), self) } } impl Not for m256d { type Output = Self; /// Not a direct intrinsic, but it's very useful and the implementation is /// simple enough. /// /// Negates the bits by performing an `xor` with an all-ones bit pattern. #[must_use] #[inline(always)] fn not(self) -> Self { let all_bits = set_splat_m256d(f64::from_bits(u64::MAX)); self ^ all_bits } } impl Sub for m256d { type Output = Self; #[must_use] #[inline(always)] fn sub(self, rhs: Self) -> Self { sub_m256d(self, rhs) } } impl SubAssign for m256d { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = *self - rhs; } } impl PartialEq for m256d { /// Performs a comparison to get a mask, then moves the mask and checks for /// all true. 
#[must_use] #[inline(always)] fn eq(&self, other: &Self) -> bool { let mask = m256d(unsafe { _mm256_cmp_pd(self.0, other.0, _CMP_EQ_OQ) }); move_mask_m256d(mask) == 0b1111 } } impl Add for m256 { type Output = Self; #[must_use] #[inline(always)] fn add(self, rhs: Self) -> Self { add_m256(self, rhs) } } impl AddAssign for m256 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = *self + rhs; } } impl BitAnd for m256 { type Output = Self; #[must_use] #[inline(always)] fn bitand(self, rhs: Self) -> Self { bitand_m256(self, rhs) } } impl BitAndAssign for m256 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = *self & rhs; } } impl BitOr for m256 { type Output = Self; #[must_use] #[inline(always)] fn bitor(self, rhs: Self) -> Self { bitor_m256(self, rhs) } } impl BitOrAssign for m256 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = *self | rhs; } } impl BitXor for m256 { type Output = Self; #[must_use] #[inline(always)] fn bitxor(self, rhs: Self) -> Self { bitxor_m256(self, rhs) } } impl BitXorAssign for m256 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = *self ^ rhs; } } impl Div for m256 { type Output = Self; #[must_use] #[inline(always)] fn div(self, rhs: Self) -> Self { div_m256(self, rhs) } } impl DivAssign for m256 { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = *self / rhs; } } impl Mul for m256 { type Output = Self; #[must_use] #[inline(always)] fn mul(self, rhs: Self) -> Self { mul_m256(self, rhs) } } impl MulAssign for m256 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = *self * rhs; } } impl Neg for m256 { type Output = Self; #[must_use] #[inline(always)] fn neg(self) -> Self { sub_m256(zeroed_m256(), self) } } impl Not for m256 { type Output = Self; /// Not a direct intrinsic, but it's very useful and the implementation is /// simple enough. /// /// Negates the bits by performing an `xor` with an all-ones bit pattern. #[must_use] #[inline(always)] fn not(self) -> Self { let all_bits = set_splat_m256(f32::from_bits(u32::MAX)); self ^ all_bits } } impl Sub for m256 { type Output = Self; #[must_use] #[inline(always)] fn sub(self, rhs: Self) -> Self { sub_m256(self, rhs) } } impl SubAssign for m256 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = *self - rhs; } } impl PartialEq for m256 { /// Performs a comparison to get a mask, then moves the mask and checks for /// all true. #[must_use] #[inline(always)] fn eq(&self, other: &Self) -> bool { let mask = m256(unsafe { _mm256_cmp_ps(self.0, other.0, _CMP_EQ_OQ) }); move_mask_m256(mask) == 0b1111_1111 } } safe_arch-0.7.1/src/x86_x64/avx2.rs000066400000000000000000003140101445526200400165760ustar00rootroot00000000000000#![cfg(target_feature = "avx2")] use super::*; /// Blends the `i32` lanes in `a` and `b` into a single value. /// /// * The blend is controlled by an immediate mask value (an `i32`). /// * For each lane `0..=3`, use `0` if you want that lane of the output to be /// from `a` and use `1` if you want that lane of the output to be from `b`. 
/// /// ``` /// # use safe_arch::*; /// let a = m128i::from([10, 20, 30, 40]); /// let b = m128i::from([100, 200, 300, 400]); /// // /// let c: [i32; 4] = blend_imm_i32_m128i::<0b0110>(a, b).into(); /// assert_eq!(c, [10, 200, 300, 40]); /// ``` /// * **Intrinsic:** [`_mm_blend_epi32`] /// * **Assembly:** `vpblendd xmm, xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn blend_imm_i32_m128i<const IMM: i32>(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_blend_epi32(a.0, b.0, IMM) }) } /// Splat the lowest 8-bit lane across the entire 128 bits. /// ``` /// # use safe_arch::*; /// let a = m128i::from(0x77_i128); /// let b: [i8; 16] = splat_i8_m128i_s_m128i(a).into(); /// assert_eq!(b, [0x77_i8; 16]); /// ``` /// * **Intrinsic:** [`_mm_broadcastb_epi8`] /// * **Assembly:** `vpbroadcastb xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn splat_i8_m128i_s_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_broadcastb_epi8(a.0) }) } /// Splat the lowest 16-bit lane across the entire 128 bits. /// ``` /// # use safe_arch::*; /// let a = m128i::from(0x77_i128); /// let b: [i16; 8] = splat_i16_m128i_s_m128i(a).into(); /// assert_eq!(b, [0x77_i16; 8]); /// ``` /// * **Intrinsic:** [`_mm_broadcastw_epi16`] /// * **Assembly:** `vpbroadcastw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn splat_i16_m128i_s_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_broadcastw_epi16(a.0) }) } /// Splat the lowest 32-bit lane across the entire 128 bits. /// ``` /// # use safe_arch::*; /// let a = m128i::from(0x77_i128); /// let b: [i32; 4] = splat_i32_m128i_s_m128i(a).into(); /// assert_eq!(b, [0x77; 4]); /// ``` /// * **Intrinsic:** [`_mm_broadcastd_epi32`] /// * **Assembly:** `vpbroadcastd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn splat_i32_m128i_s_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_broadcastd_epi32(a.0) }) } /// Splat the lowest 64-bit lane across the entire 128 bits. /// ``` /// # use safe_arch::*; /// let a = m128i::from(0x77_i128); /// let b: [i64; 2] = splat_i64_m128i_s_m128i(a).into(); /// assert_eq!(b, [0x77_i64; 2]); /// ``` /// * **Intrinsic:** [`_mm_broadcastq_epi64`] /// * **Assembly:** `vpbroadcastq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn splat_i64_m128i_s_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_broadcastq_epi64(a.0) }) } /// Splat the lower `f64` across both lanes of `m128d`. /// ``` /// # use safe_arch::*; /// let a = m128d::from([1.0, 2.0]); /// let b = splat_m128d_s_m128d(a).to_array(); /// assert_eq!(b, [1.0, 1.0]); /// ``` /// * **Intrinsic:** [`_mm_broadcastsd_pd`] /// * **Assembly:** `movddup xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn splat_m128d_s_m128d(a: m128d) -> m128d { m128d(unsafe { _mm_broadcastsd_pd(a.0) }) } /// Splat the 128-bits across 256-bits. /// ``` /// # use safe_arch::*; /// let a = m128i::from(1_i128); /// let b: [i128; 2] = splat_m128i_m256i(a).into(); /// assert_eq!(b, [1_i128, 1]); /// ``` /// * **Intrinsic:** [`_mm256_broadcastsi128_si256`] /// * **Assembly:** `vbroadcasti128 ymm, m128` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn splat_m128i_m256i(a: m128i) -> m256i { m256i(unsafe { _mm256_broadcastsi128_si256(a.0) }) } /// Splat the lowest `f32` across all four lanes.
/// ``` /// # use safe_arch::*; /// let a = set_m128_s(1.0); /// let b = splat_m128_s_m128(a).to_array(); /// assert_eq!(b, [1.0, 1.0, 1.0, 1.0]); /// ``` /// * **Intrinsic:** [`_mm_broadcastss_ps`] /// * **Assembly:** `vbroadcastss xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn splat_m128_s_m128(a: m128) -> m128 { m128(unsafe { _mm_broadcastss_ps(a.0) }) } /// Loads the reference given and zeroes any `i32` lanes not in the mask. /// /// * A lane is "in" the mask if that lane's mask value is set in the high bit /// (aka "if the lane's value is negative"). /// ``` /// # use safe_arch::*; /// let a = set_splat_i32_m128i(5); /// let b = load_masked_i32_m128i(&a, m128i::from([-1_i32, 0, 0, -1])); /// assert_eq!(<[i32; 4]>::from(b), [5, 0, 0, 5]); /// ``` /// * **Intrinsic:** [`_mm_maskload_epi32`] /// * **Assembly:** `vpmaskmovd xmm, xmm, m128` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn load_masked_i32_m128i(a: &m128i, mask: m128i) -> m128i { m128i(unsafe { _mm_maskload_epi32(a as *const m128i as *const i32, mask.0) }) } /// Loads the reference given and zeroes any `i64` lanes not in the mask. /// /// * A lane is "in" the mask if that lane's mask value is set in the high bit /// (aka "if the lane's value is negative"). /// ``` /// # use safe_arch::*; /// let a = set_splat_i64_m128i(5); /// let b = load_masked_i64_m128i(&a, m128i::from([0_i64, -1])); /// assert_eq!(<[i64; 2]>::from(b), [0_i64, 5]); /// ``` /// * **Intrinsic:** [`_mm_maskload_epi64`] /// * **Assembly:** `vpmaskmovq xmm, xmm, m128` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn load_masked_i64_m128i(a: &m128i, mask: m128i) -> m128i { m128i(unsafe { _mm_maskload_epi64(a as *const m128i as *const i64, mask.0) }) } /// Stores the `i32` masked lanes given to the reference. /// /// * A lane is "in" the mask if that lane's mask value is set in the high bit /// (aka "if the lane's value is negative"). /// * Lanes not in the mask are not modified. /// ``` /// # use safe_arch::*; /// let mut a = m128i::default(); /// store_masked_i32_m128i(&mut a, m128i::from([-1_i32, 0, 0, -1]), set_splat_i32_m128i(5)); /// assert_eq!(<[i32; 4]>::from(a), [5, 0, 0, 5]); /// ``` /// * **Intrinsic:** [`_mm_maskstore_epi32`] /// * **Assembly:** `vpmaskmovd m128, xmm, xmm` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn store_masked_i32_m128i(addr: &mut m128i, mask: m128i, a: m128i) { unsafe { _mm_maskstore_epi32(addr as *mut m128i as *mut i32, mask.0, a.0) }; } /// Stores the `i64` masked lanes given to the reference. /// /// * A lane is "in" the mask if that lane's mask value is set in the high bit /// (aka "if the lane's value is negative"). /// * Lanes not in the mask are not modified. /// ``` /// # use safe_arch::*; /// let mut a = m128i::default(); /// store_masked_i64_m128i(&mut a, m128i::from([0_i64, -1]), set_splat_i64_m128i(5)); /// assert_eq!(<[i64; 2]>::from(a), [0, 5]); /// ``` /// * **Intrinsic:** [`_mm_maskstore_epi64`] /// * **Assembly:** `vpmaskmovq m128, xmm, xmm` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn store_masked_i64_m128i(addr: &mut m128i, mask: m128i, a: m128i) { unsafe { _mm_maskstore_epi64(addr as *mut m128i as *mut i64, mask.0, a.0) }; } /// Shift `u32` values to the left by `count` bits. /// /// * Each `u32` lane in `a` is shifted by the same indexed `u32` lane in /// `count`.
/// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let count = m128i::from([5, 6, 7, 8]); /// let out: [u32; 4] = shl_each_u32_m128i(a, count).into(); /// assert_eq!(out, [1 << 5, 2 << 6, 3 << 7, 4 << 8]); /// ``` /// * **Intrinsic:** [`_mm_sllv_epi32`] /// * **Assembly:** `vpsllvd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_each_u32_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_sllv_epi32(a.0, count.0) }) } /// Shift `u64` values to the left by `count` bits. /// /// * Each `u64` lane in `a` is shifted by the same indexed `u64` lane in /// `count`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u64, 2]); /// let count = m128i::from([3_u64, 4]); /// let out: [u64; 2] = shl_each_u64_m128i(a, count).into(); /// assert_eq!(out, [1_u64 << 3, 2 << 4]); /// ``` /// * **Intrinsic:** [`_mm_sllv_epi64`] /// * **Assembly:** `vpsllvq xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_each_u64_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_sllv_epi64(a.0, count.0) }) } /// Shift `i32` values to the right by `count` bits. /// /// * Each `i32` lane in `a` is shifted by the same indexed `u32` lane in /// `count`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([100, 110, 120, -130]); /// let count = m128i::from([1, 2, 3, 4]); /// let out: [i32; 4] = shr_each_i32_m128i(a, count).into(); /// assert_eq!(out, [100 >> 1, 110 >> 2, 120 >> 3, (-130) >> 4]); /// ``` /// * **Intrinsic:** [`_mm_srav_epi32`] /// * **Assembly:** `vpsravd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_each_i32_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_srav_epi32(a.0, count.0) }) } /// Shift `u32` values to the right by `count` bits. /// /// * Each `u32` lane in `a` is shifted by the same indexed `u32` lane in /// `count`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([100, 110, 120, 130]); /// let count = m128i::from([1, 2, 3, 4]); /// let out: [u32; 4] = shr_each_u32_m128i(a, count).into(); /// assert_eq!(out, [100 >> 1, 110 >> 2, 120 >> 3, 130 >> 4]); /// ``` /// * **Intrinsic:** [`_mm_srlv_epi32`] /// * **Assembly:** `vpsrlvd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_each_u32_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_srlv_epi32(a.0, count.0) }) } /// Shift `u64` values to the right by `count` bits. /// /// * Each `u64` lane in `a` is shifted by the same indexed `u64` lane in /// `count`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([100_u64, 110]); /// let count = m128i::from([1_u64, 2]); /// let out: [u64; 2] = shr_each_u64_m128i(a, count).into(); /// assert_eq!(out, [100_u64 >> 1, 110 >> 2]); /// ``` /// * **Intrinsic:** [`_mm_srlv_epi64`] /// * **Assembly:** `vpsrlvq xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_each_u64_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_srlv_epi64(a.0, count.0) }) } /// Absolute value of `i8` lanes.
/// ``` /// # use safe_arch::*; /// let a = m256i::from([-7_i8; 32]); /// let b: [i8; 32] = abs_i8_m256i(a).into(); /// assert_eq!(b, [7_i8; 32]); /// ``` /// * **Intrinsic:** [`_mm256_abs_epi8`] /// * **Assembly:** `vpabsb ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn abs_i8_m256i(a: m256i) -> m256i { m256i(unsafe { _mm256_abs_epi8(a.0) }) } /// Absolute value of `i16` lanes. /// ``` /// # use safe_arch::*; /// let a = m256i::from([-7_i16; 16]); /// let b: [i16; 16] = abs_i16_m256i(a).into(); /// assert_eq!(b, [7_i16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_abs_epi16`] /// * **Assembly:** `vpabsw ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn abs_i16_m256i(a: m256i) -> m256i { m256i(unsafe { _mm256_abs_epi16(a.0) }) } /// Absolute value of `i32` lanes. /// ``` /// # use safe_arch::*; /// let a = m256i::from([-7_i32; 8]); /// let b: [i32; 8] = abs_i32_m256i(a).into(); /// assert_eq!(b, [7_i32; 8]); /// ``` /// * **Intrinsic:** [`_mm256_abs_epi32`] /// * **Assembly:** `vpabsd ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn abs_i32_m256i(a: m256i) -> m256i { m256i(unsafe { _mm256_abs_epi32(a.0) }) } /// Lanewise `a + b` with lanes as `i8`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i8; 32]); /// let b = m256i::from([10_i8; 32]); /// let c: [i8; 32] = add_i8_m256i(a, b).into(); /// assert_eq!(c, [15_i8; 32]); /// ``` /// * **Intrinsic:** [`_mm256_add_epi8`] /// * **Assembly:** `vpaddb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_add_epi8(a.0, b.0) }) } /// Lanewise `a + b` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16; 16]); /// let b = m256i::from([10_i16; 16]); /// let c: [i16; 16] = add_i16_m256i(a, b).into(); /// assert_eq!(c, [15_i16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_add_epi16`] /// * **Assembly:** `vpaddw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_add_epi16(a.0, b.0) }) } /// Lanewise `a + b` with lanes as `i32`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i32; 8]); /// let b = m256i::from([10_i32; 8]); /// let c: [i32; 8] = add_i32_m256i(a, b).into(); /// assert_eq!(c, [15_i32; 8]); /// ``` /// * **Intrinsic:** [`_mm256_add_epi32`] /// * **Assembly:** `vpaddd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_add_epi32(a.0, b.0) }) } /// Lanewise `a + b` with lanes as `i64`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i64; 4]); /// let b = m256i::from([10_i64; 4]); /// let c: [i64; 4] = add_i64_m256i(a, b).into(); /// assert_eq!(c, [15_i64; 4]); /// ``` /// * **Intrinsic:** [`_mm256_add_epi64`] /// * **Assembly:** `vpaddq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_i64_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_add_epi64(a.0, b.0) }) } /// Lanewise saturating `a + b` with lanes as `i8`. 
/// ``` /// # use safe_arch::*; /// let a = m256i::from([126_i8; 32]); /// let b = m256i::from([125_i8; 32]); /// let c: [i8; 32] = add_saturating_i8_m256i(a, b).into(); /// assert_eq!(c, [127_i8; 32]); /// ``` /// * **Intrinsic:** [`_mm256_adds_epi8`] /// * **Assembly:** `vpaddsb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_saturating_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_adds_epi8(a.0, b.0) }) } /// Lanewise saturating `a + b` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([32700_i16; 16]); /// let b = m256i::from([32000_i16; 16]); /// let c: [i16; 16] = add_saturating_i16_m256i(a, b).into(); /// assert_eq!(c, [32767_i16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_adds_epi16`] /// * **Assembly:** `vpaddsw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_saturating_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_adds_epi16(a.0, b.0) }) } /// Lanewise saturating `a + b` with lanes as `u8`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([126_u8; 32]); /// let b = m256i::from([125_u8; 32]); /// let c: [u8; 32] = add_saturating_u8_m256i(a, b).into(); /// assert_eq!(c, [251_u8; 32]); /// ``` /// * **Intrinsic:** [`_mm256_adds_epu8`] /// * **Assembly:** `vpaddusb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_saturating_u8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_adds_epu8(a.0, b.0) }) } /// Lanewise saturating `a + b` with lanes as `u16`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([32700_u16; 16]); /// let b = m256i::from([32000_u16; 16]); /// let c: [u16; 16] = add_saturating_u16_m256i(a, b).into(); /// assert_eq!(c, [64700_u16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_adds_epu16`] /// * **Assembly:** `vpaddusw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_saturating_u16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_adds_epu16(a.0, b.0) }) } /// Works like [`combined_byte_shr_imm_m128i`], but twice as wide. /// /// The low half of the bytes and high half of the bytes are both processed /// separately. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i8; 32]); /// let b = m256i::from([12_i8; 32]); /// // `a` bytes come in to the _high_ indexes because these are LE bytes. /// // Also note the three 5 values at the end of both the low half and the high half. /// let c: [i8; 32] = combined_byte_shr_imm_m256i::<3>(a, b).into(); /// assert_eq!( /// c, /// [ /// 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 5, 5, 5, 12, 12, 12, 12, 12, 12, 12, 12, /// 12, 12, 12, 12, 12, 5, 5, 5_i8 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_alignr_epi8`] /// * **Assembly:** `vpalignr ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn combined_byte_shr_imm_m256i<const IMM: i32>(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_alignr_epi8(a.0, b.0, IMM) }) } /// Bitwise `a & b`.
/// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i64, 0, 1, 1]); /// let b = m256i::from([0_i64, 1, 0, 1]); /// let c: [i64; 4] = bitand_m256i(a, b).into(); /// assert_eq!(c, [0_i64, 0, 0, 1]); /// ``` /// * **Intrinsic:** [`_mm256_and_si256`] /// * **Assembly:** `vpand ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn bitand_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_and_si256(a.0, b.0) }) } /// Bitwise `(!a) & b`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i64, 0, 1, 1]); /// let b = m256i::from([0_i64, 1, 0, 1]); /// let c: [i64; 4] = bitandnot_m256i(a, b).into(); /// assert_eq!(c, [0_i64, 1, 0, 0]); /// ``` /// * **Intrinsic:** [`_mm256_andnot_si256`] /// * **Assembly:** `vpandn ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn bitandnot_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_andnot_si256(a.0, b.0) }) } /// Average `u8` lanes. /// ``` /// # use safe_arch::*; /// let a = m256i::from([100_u8; 32]); /// let b = m256i::from([120_u8; 32]); /// let c: [u8; 32] = average_u8_m256i(a, b).into(); /// assert_eq!(c, [110_u8; 32]); /// ``` /// * **Intrinsic:** [`_mm256_avg_epu8`] /// * **Assembly:** `vpavgb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn average_u8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_avg_epu8(a.0, b.0) }) } /// Average `u16` lanes. /// ``` /// # use safe_arch::*; /// let a = m256i::from([100_u16; 16]); /// let b = m256i::from([120_u16; 16]); /// let c: [u16; 16] = average_u16_m256i(a, b).into(); /// assert_eq!(c, [110_u16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_avg_epu16`] /// * **Assembly:** `vpavgw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn average_u16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_avg_epu16(a.0, b.0) }) } /// Blends the `i16` lanes according to the immediate value. /// /// * The low 8 lanes and high 8 lanes both use the same immediate. /// * Each bit in `0..=7` should be set for `$b` and unset for `$a` within that /// half of the `i16` values. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16; 16]); /// let b = m256i::from([10_i16; 16]); /// // /// let c: [i16; 16] = blend_imm_i16_m256i::<0b11001000>(a, b).into(); /// assert_eq!(c, [5_i16, 5, 5, 10, 5, 5, 10, 10, 5, 5, 5, 10, 5, 5, 10, 10]); /// ``` /// * **Intrinsic:** [`_mm256_blend_epi16`] /// * **Assembly:** `vpblendw ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn blend_imm_i16_m256i<const IMM: i32>(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_blend_epi16(a.0, b.0, IMM) }) } /// Blends the `i32` lanes according to the immediate value. /// /// * Each bit in `0..=7` should be set for `$b` and unset for `$a` /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i32; 8]); /// let b = m256i::from([10_i32; 8]); /// // /// let c: [i32; 8] = blend_imm_i32_m256i::<0b11001000>(a, b).into(); /// assert_eq!(c, [5, 5, 5, 10, 5, 5, 10, 10]); /// ``` /// * **Intrinsic:** [`_mm256_blend_epi32`] /// * **Assembly:** `vpblendd ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn blend_imm_i32_m256i<const IMM: i32>(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_blend_epi32(a.0, b.0, IMM) }) } /// Blend `i8` lanes according to a runtime varying mask.
/// /// * Mask lanes should be non-negative for `a` and negative for `b`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i8; 32]); /// let b = m256i::from([10_i8; 32]); /// let mask = m256i::from([ /// 0_i8, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, /// -1, -1, -1, 0, 0, /// ]); /// let c: [i8; 32] = blend_varying_i8_m256i(a, b, mask).into(); /// assert_eq!( /// c, /// [ /// 5, 5, 5, 10, 10, 10, 5, 5, 5, 10, 10, 10, 5, 5, 5, 10, 10, 10, 5, 5, 5, 10, 10, 10, 5, 5, 5, /// 10, 10, 10, 5, 5 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_blendv_epi8`] /// * **Assembly:** `vpblendvb ymm, ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn blend_varying_i8_m256i(a: m256i, b: m256i, mask: m256i) -> m256i { m256i(unsafe { _mm256_blendv_epi8(a.0, b.0, mask.0) }) } /// Sets the lowest `i8` lane of an `m128i` as all lanes of an `m256i`. /// ``` /// # use safe_arch::*; /// let a = m128i::from(5_i8 as i128); /// let b: [i8; 32] = set_splat_i8_m128i_s_m256i(a).into(); /// assert_eq!(b, [5_i8; 32]); /// ``` /// * **Intrinsic:** [`_mm256_broadcastb_epi8`] /// * **Assembly:** `vpbroadcastb ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn set_splat_i8_m128i_s_m256i(a: m128i) -> m256i { m256i(unsafe { _mm256_broadcastb_epi8(a.0) }) } /// Sets the lowest `i16` lane of an `m128i` as all lanes of an `m256i`. /// ``` /// # use safe_arch::*; /// let a = m128i::from(5_i16 as i128); /// let b: [i16; 16] = set_splat_i16_m128i_s_m256i(a).into(); /// assert_eq!(b, [5_i16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_broadcastw_epi16`] /// * **Assembly:** `vpbroadcastw ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn set_splat_i16_m128i_s_m256i(a: m128i) -> m256i { m256i(unsafe { _mm256_broadcastw_epi16(a.0) }) } /// Sets the lowest `i32` lane of an `m128i` as all lanes of an `m256i`. /// ``` /// # use safe_arch::*; /// let a = m128i::from(5_i32 as i128); /// let b: [i32; 8] = set_splat_i32_m128i_s_m256i(a).into(); /// assert_eq!(b, [5_i32; 8]); /// ``` /// * **Intrinsic:** [`_mm256_broadcastd_epi32`] /// * **Assembly:** `vpbroadcastd ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn set_splat_i32_m128i_s_m256i(a: m128i) -> m256i { m256i(unsafe { _mm256_broadcastd_epi32(a.0) }) } /// Sets the lowest `i64` lane of an `m128i` as all lanes of an `m256i`. /// ``` /// # use safe_arch::*; /// let a = m128i::from(5_i64 as i128); /// let b: [i64; 4] = set_splat_i64_m128i_s_m256i(a).into(); /// assert_eq!(b, [5_i64; 4]); /// ``` /// * **Intrinsic:** [`_mm256_broadcastq_epi64`] /// * **Assembly:** `vpbroadcastq ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn set_splat_i64_m128i_s_m256i(a: m128i) -> m256i { m256i(unsafe { _mm256_broadcastq_epi64(a.0) }) } /// Sets the lowest lane of an `m128d` as all lanes of an `m256d`. /// ``` /// # use safe_arch::*; /// let a = set_m128d_s(5.0); /// let b = set_splat_m128d_s_m256d(a).to_array(); /// assert_eq!(b, [5.0; 4]); /// ``` /// * **Intrinsic:** [`_mm256_broadcastsd_pd`] /// * **Assembly:** `vbroadcastsd ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn set_splat_m128d_s_m256d(a: m128d) -> m256d { m256d(unsafe { _mm256_broadcastsd_pd(a.0) }) } /// Sets the lowest lane of an `m128` as all lanes of an `m256`.
/// ``` /// # use safe_arch::*; /// let a = set_m128_s(5.0); /// let b = set_splat_m128_s_m256(a).to_array(); /// assert_eq!(b, [5.0; 8]); /// ``` /// * **Intrinsic:** [`_mm256_broadcastss_ps`] /// * **Assembly:** `vbroadcastss ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn set_splat_m128_s_m256(a: m128) -> m256 { m256(unsafe { _mm256_broadcastss_ps(a.0) }) } /// Shifts each `u128` lane left by a number of **bytes**. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([0x0000000B_0000000A_0000000F_11111111_u128; 2]); /// // /// let b: [u128; 2] = byte_shl_imm_u128_m256i::<1>(a).into(); /// assert_eq!(b, [0x00000B00_00000A00_00000F11_11111100_u128; 2]); /// ``` /// * **Intrinsic:** [`_mm256_bslli_epi128`] /// * **Assembly:** `vpslldq ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn byte_shl_imm_u128_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_bslli_epi128(a.0, IMM) }) } /// Shifts each `u128` lane right by a number of **bytes**. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([0x0000000B_0000000A_0000000F_11111111_u128; 2]); /// // /// let b: [u128; 2] = byte_shr_imm_u128_m256i::<1>(a).into(); /// assert_eq!(b, [0x00000000_0B000000_0A000000_0F111111; 2]); /// ``` /// * **Intrinsic:** [`_mm256_bsrli_epi128`] /// * **Assembly:** `vpsrldq ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn byte_shr_imm_u128_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_bsrli_epi128(a.0, IMM) }) } /// Compare `i8` lanes for equality, mask output. /// ``` /// # use safe_arch::*; /// assert_eq!( /// <[i8; 32]>::from(cmp_eq_mask_i8_m256i(m256i::from([1_i8; 32]), m256i::from([1_i8; 32]))), /// [-1_i8; 32] /// ); /// assert_eq!( /// <[i8; 32]>::from(cmp_eq_mask_i8_m256i(m256i::from([5_i8; 32]), m256i::from([6_i8; 32]))), /// [0_i8; 32] /// ); /// ``` /// * **Intrinsic:** [`_mm256_cmpeq_epi8`] /// * **Assembly:** `vpcmpeqb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn cmp_eq_mask_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_cmpeq_epi8(a.0, b.0) }) } /// Compare `i16` lanes for equality, mask output. /// ``` /// # use safe_arch::*; /// assert_eq!( /// <[i16; 16]>::from(cmp_eq_mask_i16_m256i(m256i::from([1_i16; 16]), m256i::from([1_i16; 16]))), /// [-1_i16; 16] /// ); /// assert_eq!( /// <[i16; 16]>::from(cmp_eq_mask_i16_m256i(m256i::from([5_i16; 16]), m256i::from([6_i16; 16]))), /// [0_i16; 16] /// ); /// ``` /// * **Intrinsic:** [`_mm256_cmpeq_epi16`] /// * **Assembly:** `vpcmpeqw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn cmp_eq_mask_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_cmpeq_epi16(a.0, b.0) }) } /// Compare `i32` lanes for equality, mask output.
/// ``` /// # use safe_arch::*; /// assert_eq!( /// <[i32; 8]>::from(cmp_eq_mask_i32_m256i(m256i::from([1_i32; 8]), m256i::from([1_i32; 8]))), /// [-1_i32; 8] /// ); /// assert_eq!( /// <[i32; 8]>::from(cmp_eq_mask_i32_m256i(m256i::from([5_i32; 8]), m256i::from([6_i32; 8]))), /// [0_i32; 8] /// ); /// ``` /// * **Intrinsic:** [`_mm256_cmpeq_epi32`] /// * **Assembly:** `vpcmpeqd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn cmp_eq_mask_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_cmpeq_epi32(a.0, b.0) }) } /// Compare `i64` lanes for equality, mask output. /// ``` /// # use safe_arch::*; /// assert_eq!( /// <[i64; 4]>::from(cmp_eq_mask_i64_m256i(m256i::from([1_i64; 4]), m256i::from([1_i64; 4]))), /// [-1_i64; 4] /// ); /// assert_eq!( /// <[i64; 4]>::from(cmp_eq_mask_i64_m256i(m256i::from([5_i64; 4]), m256i::from([6_i64; 4]))), /// [0_i64; 4] /// ); /// ``` /// * **Intrinsic:** [`_mm256_cmpeq_epi64`] /// * **Assembly:** `vpcmpeqq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn cmp_eq_mask_i64_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_cmpeq_epi64(a.0, b.0) }) } /// Compare `i8` lanes for `a > b`, mask output. /// ``` /// # use safe_arch::*; /// assert_eq!( /// <[i8; 32]>::from(cmp_gt_mask_i8_m256i(m256i::from([1_i8; 32]), m256i::from([0_i8; 32]))), /// [-1_i8; 32] /// ); /// assert_eq!( /// <[i8; 32]>::from(cmp_gt_mask_i8_m256i(m256i::from([5_i8; 32]), m256i::from([5_i8; 32]))), /// [0_i8; 32] /// ); /// ``` /// * **Intrinsic:** [`_mm256_cmpgt_epi8`] /// * **Assembly:** `vpcmpgtb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn cmp_gt_mask_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_cmpgt_epi8(a.0, b.0) }) } /// Compare `i16` lanes for `a > b`, mask output. /// ``` /// # use safe_arch::*; /// assert_eq!( /// <[i16; 16]>::from(cmp_gt_mask_i16_m256i(m256i::from([1_i16; 16]), m256i::from([0_i16; 16]))), /// [-1_i16; 16] /// ); /// assert_eq!( /// <[i16; 16]>::from(cmp_gt_mask_i16_m256i(m256i::from([5_i16; 16]), m256i::from([5_i16; 16]))), /// [0_i16; 16] /// ); /// ``` /// * **Intrinsic:** [`_mm256_cmpgt_epi16`] /// * **Assembly:** `vpcmpgtw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn cmp_gt_mask_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_cmpgt_epi16(a.0, b.0) }) } /// Compare `i32` lanes for `a > b`, mask output. /// ``` /// # use safe_arch::*; /// assert_eq!( /// <[i32; 8]>::from(cmp_gt_mask_i32_m256i(m256i::from([1_i32; 8]), m256i::from([0_i32; 8]))), /// [-1_i32; 8] /// ); /// assert_eq!( /// <[i32; 8]>::from(cmp_gt_mask_i32_m256i(m256i::from([5_i32; 8]), m256i::from([5_i32; 8]))), /// [0_i32; 8] /// ); /// ``` /// * **Intrinsic:** [`_mm256_cmpgt_epi32`] /// * **Assembly:** `vpcmpgtd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn cmp_gt_mask_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_cmpgt_epi32(a.0, b.0) }) } /// Compare `i64` lanes for `a > b`, mask output. 
/// ``` /// # use safe_arch::*; /// assert_eq!( /// <[i64; 4]>::from(cmp_gt_mask_i64_m256i(m256i::from([1_i64; 4]), m256i::from([0_i64; 4]))), /// [-1_i64; 4] /// ); /// assert_eq!( /// <[i64; 4]>::from(cmp_gt_mask_i64_m256i(m256i::from([5_i64; 4]), m256i::from([5_i64; 4]))), /// [0_i64; 4] /// ); /// ``` /// * **Intrinsic:** [`_mm256_cmpgt_epi64`] /// * **Assembly:** `vpcmpgtq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn cmp_gt_mask_i64_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_cmpgt_epi64(a.0, b.0) }) } /// Convert `i16` values to `i32` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([-5_i16; 8]); /// let b: [i32; 8] = convert_to_i32_m256i_from_i16_m128i(a).into(); /// assert_eq!(b, [-5_i32; 8]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepi16_epi32`] /// * **Assembly:** `vpmovsxwd ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i32_m256i_from_i16_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepi16_epi32(a.0) }) } /// Convert `i16` values to `i64` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([-5_i16; 8]); /// let b: [i64; 4] = convert_to_i64_m256i_from_lower4_i16_m128i(a).into(); /// assert_eq!(b, [-5_i64; 4]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepi16_epi64`] /// * **Assembly:** `vpmovsxwq ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i64_m256i_from_lower4_i16_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepi16_epi64(a.0) }) } /// Convert `i32` values to `i64` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([-5_i32; 4]); /// let b: [i64; 4] = convert_to_i64_m256i_from_i32_m128i(a).into(); /// assert_eq!(b, [-5_i64; 4]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepi32_epi64`] /// * **Assembly:** `vpmovsxdq ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i64_m256i_from_i32_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepi32_epi64(a.0) }) } /// Convert `i8` values to `i16` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([-5_i8; 16]); /// let b: [i16; 16] = convert_to_i16_m256i_from_i8_m128i(a).into(); /// assert_eq!(b, [-5_i16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepi8_epi16`] /// * **Assembly:** `vpmovsxbw ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i16_m256i_from_i8_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepi8_epi16(a.0) }) } /// Convert the lower 8 `i8` values to `i32` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([-5_i8; 16]); /// let b: [i32; 8] = convert_to_i32_m256i_from_lower8_i8_m128i(a).into(); /// assert_eq!(b, [-5_i32; 8]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepi8_epi32`] /// * **Assembly:** `vpmovsxbd ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i32_m256i_from_lower8_i8_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepi8_epi32(a.0) }) } /// Convert the lower 4 `i8` values to `i64` values. 
/// ``` /// # use safe_arch::*; /// let a = m128i::from([-5_i8; 16]); /// let b: [i64; 4] = convert_to_i64_m256i_from_lower4_i8_m128i(a).into(); /// assert_eq!(b, [-5_i64; 4]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepi8_epi64`] /// * **Assembly:** `vpmovsxbq ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i64_m256i_from_lower4_i8_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepi8_epi64(a.0) }) } /// Convert `u16` values to `i32` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([5_u16; 8]); /// let b: [i32; 8] = convert_to_i32_m256i_from_u16_m128i(a).into(); /// assert_eq!(b, [5_i32; 8]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepu16_epi32`] /// * **Assembly:** `vpmovzxwd ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i32_m256i_from_u16_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepu16_epi32(a.0) }) } /// Convert `u16` values to `i64` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([5_u16; 8]); /// let b: [i64; 4] = convert_to_i64_m256i_from_lower4_u16_m128i(a).into(); /// assert_eq!(b, [5_i64; 4]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepu16_epi64`] /// * **Assembly:** `vpmovzxwq ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i64_m256i_from_lower4_u16_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepu16_epi64(a.0) }) } /// Convert `u32` values to `i64` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([5_u32; 4]); /// let b: [i64; 4] = convert_to_i64_m256i_from_u32_m128i(a).into(); /// assert_eq!(b, [5_i64; 4]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepu32_epi64`] /// * **Assembly:** `vpmovzxdq ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i64_m256i_from_u32_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepu32_epi64(a.0) }) } /// Convert `u8` values to `i16` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([5_u8; 16]); /// let b: [i16; 16] = convert_to_i16_m256i_from_u8_m128i(a).into(); /// assert_eq!(b, [5_i16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepu8_epi16`] /// * **Assembly:** `vpmovzxbw ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i16_m256i_from_u8_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepu8_epi16(a.0) }) } /// Convert lower 8 `u8` values to `i32` values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([5_u8; 16]); /// let b: [i32; 8] = convert_to_i16_m256i_from_lower8_u8_m128i(a).into(); /// assert_eq!(b, [5_i32; 8]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepu8_epi32`] /// * **Assembly:** `vpmovzxbd ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i16_m256i_from_lower8_u8_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepu8_epi32(a.0) }) } /// Convert lower 4 `u8` values to `i64` values.
/// ``` /// # use safe_arch::*; /// let a = m128i::from([5_u8; 16]); /// let b: [i64; 4] = convert_to_i16_m256i_from_lower4_u8_m128i(a).into(); /// assert_eq!(b, [5_i64; 4]); /// ``` /// * **Intrinsic:** [`_mm256_cvtepu8_epi64`] /// * **Assembly:** `vpmovzxbq ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn convert_to_i16_m256i_from_lower4_u8_m128i(a: m128i) -> m256i { m256i(unsafe { _mm256_cvtepu8_epi64(a.0) }) } /// Gets an `i16` value out of an `m256i`, returns as `i32`. /// /// The lane to get must be a constant in the range `0..16`. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([0xA_i16, 0xB, 0xC, 0xD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); /// // /// assert_eq!(extract_i16_as_i32_m256i::<0>(a), 0xA); /// assert_eq!(extract_i16_as_i32_m256i::<1>(a), 0xB); /// ``` /// * **Intrinsic:** [`_mm256_extract_epi16`] #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn extract_i16_as_i32_m256i<const LANE: i32>(a: m256i) -> i32 { unsafe { _mm256_extract_epi16(a.0, LANE) } } /// Gets an `i8` value out of an `m256i`, returns as `i32`. /// /// The lane to get must be a constant in the range `0..32`. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 0xA_i8, 0xB, 0xC, 0xD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /// 0, 0, 0, 0, /// ]); /// // /// assert_eq!(extract_i8_as_i32_m256i::<0>(a), 0xA); /// assert_eq!(extract_i8_as_i32_m256i::<1>(a), 0xB); /// ``` /// * **Intrinsic:** [`_mm256_extract_epi8`] #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn extract_i8_as_i32_m256i<const LANE: i32>(a: m256i) -> i32 { unsafe { _mm256_extract_epi8(a.0, LANE) } } /// Gets an `m128i` value out of an `m256i`. /// /// The lane to get must be a constant 0 or 1. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_u128, 6_u128]); /// // /// assert_eq!(extract_m128i_m256i::<0>(a), m128i::from(5_u128)); /// assert_eq!(extract_m128i_m256i::<1>(a), m128i::from(6_u128)); /// ``` /// * **Intrinsic:** [`_mm256_extracti128_si256`] /// * **Assembly:** `vextracti128 xmm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn extract_m128i_m256i<const LANE: i32>(a: m256i) -> m128i { m128i(unsafe { _mm256_extracti128_si256(a.0, LANE) }) } /// Horizontal `a + b` with lanes as `i16`. /// /// * The results are interleaved 128-bits at a time: a.low, b.low, a.high, /// b.high /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16; 16]); /// let b = m256i::from([6_i16; 16]); /// let c: [i16; 16] = add_horizontal_i16_m256i(a, b).into(); /// assert_eq!(c, [10_i16, 10, 10, 10, 12, 12, 12, 12, 10, 10, 10, 10, 12, 12, 12, 12]); /// ``` /// * **Intrinsic:** [`_mm256_hadd_epi16`] /// * **Assembly:** `vphaddw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_horizontal_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_hadd_epi16(a.0, b.0) }) } /// Horizontal saturating `a + b` with lanes as `i16`.
/// /// * The results are interleaved 128-bits at a time: a.low, b.low, a.high, /// b.high /// ``` /// # use safe_arch::*; /// let a = m256i::from([i16::MAX; 16]); /// let b = m256i::from([i16::MIN; 16]); /// let c: [i16; 16] = add_horizontal_saturating_i16_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// i16::MAX, i16::MAX, i16::MAX, i16::MAX, /// i16::MIN, i16::MIN, i16::MIN, i16::MIN, /// i16::MAX, i16::MAX, i16::MAX, i16::MAX, /// i16::MIN, i16::MIN, i16::MIN, i16::MIN, /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_hadds_epi16`] /// * **Assembly:** `vphaddsw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] #[rustfmt::skip] pub fn add_horizontal_saturating_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_hadds_epi16(a.0, b.0) }) } /// Horizontal `a + b` with lanes as `i32`. /// /// * The results are interleaved 128-bits at a time: a.low, b.low, a.high, /// b.high /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i32; 8]); /// let b = m256i::from([6_i32; 8]); /// let c: [i32; 8] = add_horizontal_i32_m256i(a, b).into(); /// assert_eq!(c, [10, 10, 12, 12, 10, 10, 12, 12]); /// ``` /// * **Intrinsic:** [`_mm256_hadd_epi32`] /// * **Assembly:** `vphaddd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn add_horizontal_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_hadd_epi32(a.0, b.0) }) } /// Horizontal `a - b` with lanes as `i16`. /// /// * The results are interleaved 128-bits at a time: a.low, b.low, a.high, /// b.high /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]); /// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, 8, 7, 6, 5, 234, 654, 123, 978]); /// let c: [i16; 16] = sub_horizontal_i16_m256i(a, b).into(); /// assert_eq!(c, [-1_i16, -3, 1, 1, -1000, 6, -1, -1, -25, 35, 1, 1, 1, 1, -420, -855]); /// ``` /// * **Intrinsic:** [`_mm256_hsub_epi16`] /// * **Assembly:** `vphsubw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_horizontal_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_hsub_epi16(a.0, b.0) }) } /// Horizontal `a - b` with lanes as `i32`. /// /// * The results are interleaved 128-bits at a time: a.low, b.low, a.high, /// b.high /// ``` /// # use safe_arch::*; /// let a = m256i::from([5, 6, 2, 5, 4, 3, 1, 0]); /// let b = m256i::from([-12, 13, 56, 21, 8, 7, 6, 5]); /// let c: [i32; 8] = sub_horizontal_i32_m256i(a, b).into(); /// assert_eq!(c, [-1, -3, -25, 35, 1, 1, 1, 1]); /// ``` /// * **Intrinsic:** [`_mm256_hsub_epi32`] /// * **Assembly:** `vphsubd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_horizontal_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_hsub_epi32(a.0, b.0) }) } /// Horizontal saturating `a - b` with lanes as `i16`.
/// /// * The results are interleaved 128-bits at a time: a.low, b.low, a.high, /// b.high /// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, /// i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, /// ]); /// let b = m256i::from([ /// i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, /// i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, i16::MIN, i16::MAX, /// ]); /// let c: [i16; 16] = sub_horizontal_saturating_i16_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// i16::MAX, i16::MAX, i16::MAX, i16::MAX, /// i16::MIN, i16::MIN, i16::MIN, i16::MIN, /// i16::MAX, i16::MAX, i16::MAX, i16::MAX, /// i16::MIN, i16::MIN, i16::MIN, i16::MIN, /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_hsubs_epi16`] /// * **Assembly:** `vphsubsw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] #[rustfmt::skip] pub fn sub_horizontal_saturating_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_hsubs_epi16(a.0, b.0) }) } /// Multiply `i16` lanes producing `i32` values, horizontal add pairs of `i32` /// values to produce the final output. /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i16, 2, 3, 4, -1, -2, -3, -4, 12, 13, -14, -15, 100, 200, 300, -400]); /// let b = m256i::from([5_i16, 6, 7, 8, -15, -26, -37, 48, 50, 60, 70, -80, 90, 100, 12, -80]); /// let c: [i32; 8] = mul_i16_horizontal_add_m256i(a, b).into(); /// assert_eq!(c, [17, 53, 67, -81, 1380, 220, 29000, 35600]); /// ``` /// * **Intrinsic:** [`_mm256_madd_epi16`] /// * **Assembly:** `vpmaddwd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn mul_i16_horizontal_add_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_madd_epi16(a.0, b.0) }) } /// This is dumb and weird. /// /// * Vertically multiplies each `u8` lane from `a` with an `i8` lane from `b`, /// producing an `i16` intermediate value. /// * These intermediate `i16` values are horizontally added with saturation. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 255_u8, 255, 0, 0, 255, 255, 1, 1, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, /// 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, /// ]); /// let b = m256i::from([ /// 127_i8, 127, 0, 0, -127, -127, 1, 1, 24, 25, 26, 27, 28, 29, 30, 31, 16, /// 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, /// ]); /// let c: [i16; 16] = mul_u8i8_add_horizontal_saturating_m256i(a, b).into(); /// assert_eq!( /// c, /// [i16::MAX, 0, i16::MIN, 2, 417, 557, 713, 885, /// 545, 685, 841, 1013, 1201, 1405, 1625, 1861] /// ); /// ``` /// * **Intrinsic:** [`_mm256_maddubs_epi16`] /// * **Assembly:** `vpmaddubsw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] #[rustfmt::skip] pub fn mul_u8i8_add_horizontal_saturating_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_maddubs_epi16(a.0, b.0) }) } /// Loads the reference given and zeroes any `i32` lanes not in the mask. /// /// * A lane is "in" the mask if that lane's mask value is set in the high bit /// (aka "if the lane's value is negative").
/// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i32; 8]); /// let b = load_masked_i32_m256i(&a, m256i::from([-1_i32, 0, 0, -1, -1, -1, 0, 0])); /// assert_eq!(<[i32; 8]>::from(b), [5, 0, 0, 5, 5, 5, 0, 0]); /// ``` /// * **Intrinsic:** [`_mm256_maskload_epi32`] /// * **Assembly:** `vpmaskmovd ymm, ymm, m256` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn load_masked_i32_m256i(a: &m256i, mask: m256i) -> m256i { m256i(unsafe { _mm256_maskload_epi32(a as *const m256i as *const i32, mask.0) }) } /// Loads the reference given and zeroes any `i64` lanes not in the mask. /// /// * A lane is "in" the mask if that lane's mask value is set in the high bit /// (aka "if the lane's value is negative"). /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i64; 4]); /// let b = load_masked_i64_m256i(&a, m256i::from([0_i64, -1, -1, 0])); /// assert_eq!(<[i64; 4]>::from(b), [0_i64, 5, 5, 0]); /// ``` /// * **Intrinsic:** [`_mm256_maskload_epi64`] /// * **Assembly:** `vpmaskmovq ymm, ymm, m256` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn load_masked_i64_m256i(a: &m256i, mask: m256i) -> m256i { m256i(unsafe { _mm256_maskload_epi64(a as *const m256i as *const i64, mask.0) }) } /// Stores the `i32` masked lanes given to the reference. /// /// * A lane is "in" the mask if that lane's mask value is set in the high bit /// (aka "if the lane's value is negative"). /// * Lanes not in the mask are not modified. /// ``` /// # use safe_arch::*; /// let mut a = m256i::default(); /// store_masked_i32_m256i( /// &mut a, /// m256i::from([-1_i32, 0, 0, -1, -1, -1, 0, 0]), /// m256i::from([5_i32; 8]), /// ); /// assert_eq!(<[i32; 8]>::from(a), [5, 0, 0, 5, 5, 5, 0, 0]); /// ``` /// * **Intrinsic:** [`_mm256_maskstore_epi32`] /// * **Assembly:** `vpmaskmovd m256, ymm, ymm` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn store_masked_i32_m256i(addr: &mut m256i, mask: m256i, a: m256i) { unsafe { _mm256_maskstore_epi32(addr as *mut m256i as *mut i32, mask.0, a.0) }; } /// Stores the `i64` masked lanes given to the reference. /// /// * A lane is "in" the mask if that lane's mask value is set in the high bit /// (aka "if the lane's value is negative"). /// * Lanes not in the mask are not modified. /// ``` /// # use safe_arch::*; /// let mut a = m256i::default(); /// store_masked_i64_m256i(&mut a, m256i::from([0_i64, -1, -1, 0]), m256i::from([5_i64; 4])); /// assert_eq!(<[i64; 4]>::from(a), [0, 5, 5, 0]); /// ``` /// * **Intrinsic:** [`_mm256_maskstore_epi64`] /// * **Assembly:** `vpmaskmovq m256, ymm, ymm` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn store_masked_i64_m256i(addr: &mut m256i, mask: m256i, a: m256i) { unsafe { _mm256_maskstore_epi64(addr as *mut m256i as *mut i64, mask.0, a.0) }; } /// Inserts an `m128i` to an `m256i` at the high or low position. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i32; 8]); /// let b: [i32; 8] = insert_m128i_to_m256i::<1>(a, m128i::from([1, 2, 3, 4])).into(); /// assert_eq!(b, [0, 0, 0, 0, 1, 2, 3, 4]); /// ``` /// * **Intrinsic:** [`_mm256_inserti128_si256`] /// * **Assembly:** `vinserti128 ymm, ymm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn insert_m128i_to_m256i<const LANE: i32>(a: m256i, b: m128i) -> m256i { m256i(unsafe { _mm256_inserti128_si256(a.0, b.0, LANE) }) } /// Lanewise `max(a, b)` with lanes as `i8`.
/// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 1, 3, 5, 7, 2, 3, 5, 12, 13, 16, 27, /// 28, 29, 30, 31, 32, /// ]); /// let b = m256i::from([ /// 0_i8, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, 24, 127, 0, -1, 3, 4, 5, 1, -2, -4, /// -8, 12, 13, 14, 29, 30, -31, -32, /// ]); /// let c: [i8; 32] = max_i8_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// 0, 11, 2, 3, 4, 15, 6, 7, 8, 19, 10, 21, 22, 13, 24, 127, 1, 3, 5, 7, 5, 3, 5, 12, 13, 16, /// 27, 28, 29, 30, 31, 32 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_max_epi8`] /// * **Assembly:** `vpmaxsb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn max_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_max_epi8(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]); /// let b = m256i::from([0_i16, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, -24, 25]); /// let c: [i16; 16] = max_i16_m256i(a, b).into(); /// assert_eq!(c, [0, 11, 2, 3, 4, 15, 6, 7, 8, 19, 10, 21, 22, 13, 14, 127]); /// ``` /// * **Intrinsic:** [`_mm256_max_epi16`] /// * **Assembly:** `vpmaxsw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn max_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_max_epi16(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `i32`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]); /// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]); /// let c: [i32; 8] = max_i32_m256i(a, b).into(); /// assert_eq!(c, [0, 11, 2, 3, 4, 15, 6, 7]); /// ``` /// * **Intrinsic:** [`_mm256_max_epi32`] /// * **Assembly:** `vpmaxsd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn max_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_max_epi32(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `u8`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 1, 3, 5, 7, 2, 3, 5, 12, 13, 16, 27, /// 28, 29, 30, 31, 32, /// ]); /// let b = m256i::from([ /// 0_u8, 255, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 0, 1, 3, 4, 5, 1, 2, 4, 8, 12, /// 13, 14, 29, 30, 31, 32, /// ]); /// let c: [u8; 32] = max_u8_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// 0, 255, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 1, 3, 5, 7, 5, 3, 5, 12, 13, 16, /// 27, 28, 29, 30, 31, 32 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_max_epu8`] /// * **Assembly:** `vpmaxub ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn max_u8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_max_epu8(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `u16`. 
/// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]); /// let b = m256i::from([0_u16, 65535, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 25]); /// let c: [u16; 16] = max_u16_m256i(a, b).into(); /// assert_eq!(c, [0, 65535, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127]); /// ``` /// * **Intrinsic:** [`_mm256_max_epu16`] /// * **Assembly:** `vpmaxuw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn max_u16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_max_epu16(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `u32`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u32, 1, 2, 3, 4, 5, 6, 7]); /// let b = m256i::from([0_u32, 11, 2, 13, 4, 15, 6, 17]); /// let c: [u32; 8] = max_u32_m256i(a, b).into(); /// assert_eq!(c, [0, 11, 2, 13, 4, 15, 6, 17]); /// ``` /// * **Intrinsic:** [`_mm256_max_epu32`] /// * **Assembly:** `vpmaxud ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn max_u32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_max_epu32(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `i8`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 1, 3, 5, 7, 2, 3, 5, 12, 13, 16, 27, /// 28, 29, 30, 31, 32, /// ]); /// let b = m256i::from([ /// 0_i8, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, 24, 127, 0, -1, 3, 4, 5, 1, -2, -4, /// -8, 12, 13, 14, 29, 30, -31, -32, /// ]); /// let c: [i8; 32] = min_i8_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// 0, 1, 2, -13, 4, 5, 6, -17, -8, 9, -20, 11, 12, -23, 14, 127, 0, -1, 3, 4, 2, 1, -2, -4, -8, /// 12, 13, 14, 29, 30, -31, -32 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_min_epi8`] /// * **Assembly:** `vpminsb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn min_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_min_epi8(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]); /// let b = m256i::from([0_i16, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, -24, 25]); /// let c: [i16; 16] = min_i16_m256i(a, b).into(); /// assert_eq!(c, [0, 1, 2, -13, 4, 5, 6, -17, -8, 9, -20, 11, 12, -23, -24, 25]); /// ``` /// * **Intrinsic:** [`_mm256_min_epi16`] /// * **Assembly:** `vpminsw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn min_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_min_epi16(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `i32`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]); /// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]); /// let c: [i32; 8] = min_i32_m256i(a, b).into(); /// assert_eq!(c, [0, 1, 2, -13, 4, 5, 6, -17]); /// ``` /// * **Intrinsic:** [`_mm256_min_epi32`] /// * **Assembly:** `vpminsd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn min_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_min_epi32(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `u8`. 
/// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 1, 3, 5, 7, 2, 3, 5, 12, 13, 16, 27, /// 28, 29, 30, 31, 32, /// ]); /// let b = m256i::from([ /// 0_u8, 255, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 0, 1, 3, 4, 5, 1, 2, 4, 8, 12, /// 13, 14, 29, 30, 31, 32, /// ]); /// let c: [u8; 32] = min_u8_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127, 0, 1, 3, 4, 2, 1, 2, 4, 8, 12, 13, 14, /// 29, 30, 31, 32 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_min_epu8`] /// * **Assembly:** `vpminub ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn min_u8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_min_epu8(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `u16`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]); /// let b = m256i::from([0_u16, 65535, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 25]); /// let c: [u16; 16] = min_u16_m256i(a, b).into(); /// assert_eq!(c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 25]); /// ``` /// * **Intrinsic:** [`_mm256_min_epu16`] /// * **Assembly:** `vpminuw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn min_u16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_min_epu16(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `u32`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u32, 1, 2, 3, 4, 5, 6, 7]); /// let b = m256i::from([0_u32, 11, 2, 13, 4, 15, 6, 17]); /// let c: [u32; 8] = min_u32_m256i(a, b).into(); /// assert_eq!(c, [0, 1, 2, 3, 4, 5, 6, 7]); /// ``` /// * **Intrinsic:** [`_mm256_min_epu32`] /// * **Assembly:** `vpminud ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn min_u32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_min_epu32(a.0, b.0) }) } /// Create an `i32` mask of each sign bit in the `i8` lanes. /// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 0_i8, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, 24, 127, 0, -1, 3, 4, 5, 1, -2, -4, /// -8, 12, 13, 14, 29, 30, -31, 32, /// ]); /// assert_eq!(0b01000001110000100010010110001000, move_mask_i8_m256i(a)); /// ``` /// * **Intrinsic:** [`_mm256_movemask_epi8`] /// * **Assembly:** `vpmovmskb r32, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn move_mask_i8_m256i(a: m256i) -> i32 { unsafe { _mm256_movemask_epi8(a.0) } } /// Computes eight `u16` "sum of absolute difference" values according to the /// bytes selected. /// /// * This essentially works like two [`multi_packed_sum_abs_diff_u8_m128i`] /// uses happening at once, the "low" portion works on the lower 128 bits, and /// the "high" portion works on the upper 128 bits. 
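/// * Bits `0..=2` of the immediate control the low 128-bit portion and bits `3..=5` control the high 128-bit portion, with each three-bit group read the same way as the immediate of the `m128i` version.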
/// /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_u8; 32]); /// let b = m256i::from([7_u8; 32]); /// // /// let c: [u16; 16] = multi_packed_sum_abs_diff_u8_m256i::<0b101000>(a, b).into(); /// assert_eq!(c, [8_u16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_mpsadbw_epu8`] /// * **Assembly:** `vmpsadbw ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn multi_packed_sum_abs_diff_u8_m256i<const IMM: i32>(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_mpsadbw_epu8(a.0, b.0, IMM) }) } /// Multiply the lower `i32` within each `i64` lane, `i64` output. /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i64, 2, 3, 4]); /// let b = m256i::from([5_i64, 6, 7, -8]); /// let c: [i64; 4] = mul_i64_low_bits_m256i(a, b).into(); /// assert_eq!(c, [5_i64, 12, 21, -32]); /// ``` /// * **Intrinsic:** [`_mm256_mul_epi32`] /// * **Assembly:** `vpmuldq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn mul_i64_low_bits_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_mul_epi32(a.0, b.0) }) } /// Multiply the lower `u32` within each `u64` lane, `u64` output. /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_u64, 2, 3, 4]); /// let b = m256i::from([5_u64, 6, 7, 8]); /// let c: [u64; 4] = mul_u64_low_bits_m256i(a, b).into(); /// assert_eq!(c, [5_u64, 12, 21, 32]); /// ``` /// * **Intrinsic:** [`_mm256_mul_epu32`] /// * **Assembly:** `vpmuludq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn mul_u64_low_bits_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_mul_epu32(a.0, b.0) }) } /// Multiply the `i16` lanes and keep the high half of each 32-bit output. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]); /// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, 8, 7, 6, 5, 234, 654, 123, 978]); /// let c: [i16; 16] = mul_i16_keep_high_m256i(a, b).into(); /// assert_eq!(c, [0_i16, 1, -1, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0]); /// ``` /// * **Intrinsic:** [`_mm256_mulhi_epi16`] /// * **Assembly:** `vpmulhw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn mul_i16_keep_high_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_mulhi_epi16(a.0, b.0) }) } /// Multiply the `u16` lanes and keep the high half of each 32-bit output. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_u16, 6, 2, 5, 4, 3, 1, 0, 12000, 13, 56, 21, 8, 7, 6, 5]); /// let b = m256i::from([12000_u16, 13000, 2000, 800, 0, 1, 2, 3, 8, 7, 6, 5, 234, 654, 123, 978]); /// let c: [u16; 16] = mul_u16_keep_high_m256i(a, b).into(); /// assert_eq!(c, [0_u16, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]); /// ``` /// * **Intrinsic:** [`_mm256_mulhi_epu16`] /// * **Assembly:** `vpmulhuw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn mul_u16_keep_high_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_mulhi_epu16(a.0, b.0) }) } /// Multiply `i16` lanes into `i32` intermediates, keep the high 18 bits, round /// by adding 1, right shift by 1.
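/// /// In scalar terms, each output lane is approximately `((((a as i32) * (b as i32)) >> 14) + 1) >> 1`, truncated back to 16 bits. For example, lane 1 below multiplies `100 * 900`: /// ``` /// // a scalar sketch of the `vpmulhrsw` rounding for one lane /// let (a, b) = (100_i32, 900_i32); /// assert_eq!((((a * b) >> 14) + 1) >> 1, 3); /// ```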
/// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 0_i16, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, /// ]); /// let b = m256i::from([ /// 800_i16, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, /// 2300, /// ]); /// let c: [i16; 16] = mul_i16_scale_round_m256i(a, b).into(); /// assert_eq!(c, [0_i16, 3, 6, 10, 15, 20, 26, 32, 39, 47, 55, 64, 73, 83, 94, 105]); /// ``` /// * **Intrinsic:** [`_mm256_mulhrs_epi16`] /// * **Assembly:** `vpmulhrsw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn mul_i16_scale_round_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_mulhrs_epi16(a.0, b.0) }) } /// Multiply the `i16` lanes and keep the low half of each 32-bit output. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]); /// let b = m256i::from([-1_i16, 13000, -2, -8, 0, 1, 2, 3, 8, 7, 6, 5, 234, 654, 123, 978]); /// let c: [i16; 16] = mul_i16_keep_low_m256i(a, b).into(); /// assert_eq!(c, [-5, 12464, -4, -40, 0, 3, 2, 0, -96, 91, 336, 105, 1872, 4578, 738, 4890]); /// ``` /// * **Intrinsic:** [`_mm256_mullo_epi16`] /// * **Assembly:** `vpmullw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn mul_i16_keep_low_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_mullo_epi16(a.0, b.0) }) } /// Multiply the `i32` lanes and keep the low half of each 64-bit output. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]); /// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]); /// let c: [i32; 8] = mul_i32_keep_low_m256i(a, b).into(); /// assert_eq!(c, [0, 11, 4, -39, 16, 75, 36, -119]); /// ``` /// * **Intrinsic:** [`_mm256_mullo_epi32`] /// * **Assembly:** `vpmulld ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn mul_i32_keep_low_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_mullo_epi32(a.0, b.0) }) } /// Bitwise `a | b` /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i64, 0, 1, 1]); /// let b = m256i::from([0_i64, 1, 0, 1]); /// let c: [i64; 4] = bitor_m256i(a, b).into(); /// assert_eq!(c, [0_i64, 1, 1, 1]); /// ``` /// * **Intrinsic:** [`_mm256_or_si256`] /// * **Assembly:** `vpor ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn bitor_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_or_si256(a.0, b.0) }) } /// Saturating convert `i16` to `i8`, and pack the values. /// /// * The values are packed 128 bits at a time: `a_low`, `b_low`, `a_high`, /// `b_high` /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); /// let b = m256i::from([17_i16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]); /// let c: [i8; 32] = pack_i16_to_i8_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// 1_i8, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16, 25, /// 26, 27, 28, 29, 30, 31, 32 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_packs_epi16`] /// * **Assembly:** `vpacksswb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn pack_i16_to_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_packs_epi16(a.0, b.0) }) } /// Saturating convert `i32` to `i16`, and pack the values. 
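/// /// * Values outside the `i16` range saturate: anything above `i16::MAX` becomes `i16::MAX`, and anything below `i16::MIN` becomes `i16::MIN`.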
/// /// * The values are packed 128 bits at a time: `a_low`, `b_low`, `a_high`, /// `b_high` /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i32, 2, 3, 4, 5, 6, 7, 8]); /// let b = m256i::from([9_i32, 10, 11, 12, 13, 14, 15, 16]); /// let c: [i16; 16] = pack_i32_to_i16_m256i(a, b).into(); /// assert_eq!(c, [1_i16, 2, 3, 4, 9, 10, 11, 12, 5, 6, 7, 8, 13, 14, 15, 16]); /// ``` /// * **Intrinsic:** [`_mm256_packs_epi32`] /// * **Assembly:** `vpackssdw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn pack_i32_to_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_packs_epi32(a.0, b.0) }) } /// Saturating convert `i16` to `u8`, and pack the values. /// /// * The values are packed 128 bits at a time: `a_low`, `b_low`, `a_high`, /// `b_high` /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); /// let b = m256i::from([17_i16, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]); /// let c: [u8; 32] = pack_i16_to_u8_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// 1_u8, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 9, 10, 11, 12, 13, 14, 15, 16, 25, /// 26, 27, 28, 29, 30, 31, 32 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_packus_epi16`] /// * **Assembly:** `vpackuswb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn pack_i16_to_u8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_packus_epi16(a.0, b.0) }) } /// Saturating convert `i32` to `u16`, and pack the values. /// /// * The values are packed 128 bits at a time: `a_low`, `b_low`, `a_high`, /// `b_high` /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i32, 2, 3, 4, 5, 6, 7, 8]); /// let b = m256i::from([9_i32, 10, 11, 12, 13, 14, 15, 16]); /// let c: [u16; 16] = pack_i32_to_u16_m256i(a, b).into(); /// assert_eq!(c, [1_u16, 2, 3, 4, 9, 10, 11, 12, 5, 6, 7, 8, 13, 14, 15, 16]); /// ``` /// * **Intrinsic:** [`_mm256_packus_epi32`] /// * **Assembly:** `vpackusdw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn pack_i32_to_u16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_packus_epi32(a.0, b.0) }) } /// Shuffle 128 bits of integer data from `$a` and `$b` using an immediate /// control value. /// /// The low 4 bits of the immediate pick the source of the output's low 128 bits and /// the high 4 bits pick the source of the output's high 128 bits: in each group, /// `0b00` is `a` low, `0b01` is `a` high, `0b10` is `b` low, `0b11` is `b` high, and /// setting the `0b1000` bit zeroes that half of the output instead. /// ``` /// # use safe_arch::*; /// let a = m256i::from([1, 2, 3, 4, 5, 6, 7, 8]); /// let b = m256i::from([9, 10, 11, 12, 13, 14, 15, 16]); /// // /// let c: [i32; 8] = shuffle_abi_i128z_all_m256i::<0b_1000_0010>(a, b).into(); /// assert_eq!(c, [9, 10, 11, 12, 0, 0, 0, 0]); /// // /// let c: [i32; 8] = shuffle_abi_i128z_all_m256i::<0b_0001_1000>(a, b).into(); /// assert_eq!(c, [0, 0, 0, 0, 5, 6, 7, 8]); /// ``` /// * **Intrinsic:** [`_mm256_permute2x128_si256`] /// * **Assembly:** `vperm2i128 ymm, ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shuffle_abi_i128z_all_m256i<const MASK: i32>(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_permute2x128_si256(a.0, b.0, MASK) }) } /// Shuffle the `i64` lanes in `$a` using an immediate control value.
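/// /// Each two bits of the immediate, starting from the lowest pair, pick which source lane feeds the corresponding output lane.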
/// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i64, 6, 7, 8]); /// let b: [i64; 4] = shuffle_ai_i64_all_m256i::<0b00_01_10_11>(a).into(); /// assert_eq!(b, [8_i64, 7, 6, 5]); /// ``` /// * **Intrinsic:** [`_mm256_permute4x64_epi64`] /// * **Assembly:** `vpermq ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shuffle_ai_i64_all_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_permute4x64_epi64(a.0, IMM) }) } /// Shuffle the `f64` lanes in `a` using an immediate control value. /// ``` /// # use safe_arch::*; /// let a = m256d::from_array([5.0, 6.0, 7.0, 8.0]); /// let b: [f64; 4] = shuffle_ai_f64_all_m256d::<0b00_01_10_11>(a).to_array(); /// assert_eq!(b, [8.0, 7.0, 6.0, 5.0]); /// ``` /// * **Intrinsic:** [`_mm256_permute4x64_pd`] /// * **Assembly:** `vpermpd ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shuffle_ai_f64_all_m256d<const IMM: i32>(a: m256d) -> m256d { m256d(unsafe { _mm256_permute4x64_pd(a.0, IMM) }) } /// Shuffle `i32` lanes in `a` using `i32` values in `v`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([8, 9, 10, 11, 12, 13, 14, 15]); /// let v = m256i::from([7, 6, 5, 5, 3, 2, 2, 0]); /// let c: [i32; 8] = shuffle_av_i32_all_m256i(a, v).into(); /// assert_eq!(c, [15, 14, 13, 13, 11, 10, 10, 8]); /// ``` /// * **Intrinsic:** [`_mm256_permutevar8x32_epi32`] /// * **Assembly:** `vpermd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shuffle_av_i32_all_m256i(a: m256i, v: m256i) -> m256i { m256i(unsafe { _mm256_permutevar8x32_epi32(a.0, v.0) }) } /// Shuffle `f32` lanes in `a` using `i32` values in `v`. /// ``` /// # use safe_arch::*; /// let a = m256::from_array([8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]); /// let v = m256i::from([7, 6, 5, 5, 3, 2, 2, 0]); /// let c: [f32; 8] = shuffle_av_i32_all_m256(a, v).to_array(); /// assert_eq!(c, [15.0, 14.0, 13.0, 13.0, 11.0, 10.0, 10.0, 8.0]); /// ``` /// * **Intrinsic:** [`_mm256_permutevar8x32_ps`] /// * **Assembly:** `vpermps ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shuffle_av_i32_all_m256(a: m256, v: m256i) -> m256 { m256(unsafe { _mm256_permutevar8x32_ps(a.0, v.0) }) } /// Compute "sum of `u8` absolute differences". /// /// * `u8` lanewise `abs(a - b)`, producing `u8` intermediate values. /// * Sum the first eight and second eight values. /// * Place into the low 16 bits of four `u64` lanes. /// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 0_u8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 0, 11, 2, 13, 4, 15, 6, 17, 8, /// 19, 20, 21, 22, 23, 24, 127, /// ]); /// let b = m256i::from([ /// 20_u8, 110, 250, 103, 34, 105, 60, 217, 8, 19, 210, 201, 202, 203, 204, 127, 2, 3, 4, 5, 6, 7, /// 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, /// ]); /// let c: [u64; 4] = sum_of_u8_abs_diff_m256i(a, b).into(); /// assert_eq!(c, [831_u64, 910, 40, 160]); /// ``` /// * **Intrinsic:** [`_mm256_sad_epu8`] /// * **Assembly:** `vpsadbw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sum_of_u8_abs_diff_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_sad_epu8(a.0, b.0) }) } /// Shuffle the `i32` lanes in `a` using an immediate control value. /// /// Each lane selection value picks only within that 128-bit half of the overall /// register.
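/// The same control value is applied to both 128-bit halves: each pair of control bits selects one of the four `i32` lanes within that half.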
/// ``` /// # use safe_arch::*; /// let a = m256i::from([5, 6, 7, 8, 9, 10, 11, 12]); /// let b: [i32; 8] = shuffle_ai_i32_half_m256i::<0b00_01_10_11>(a).into(); /// assert_eq!(b, [8, 7, 6, 5, 12, 11, 10, 9]); /// ``` /// * **Intrinsic:** [`_mm256_shuffle_epi32`] /// * **Assembly:** `vpshufd ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shuffle_ai_i32_half_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_shuffle_epi32(a.0, IMM) }) } /// Shuffle `i8` lanes in `a` using `i8` values in `v`. /// /// Each lane selection value picks only within that 128-bit half of the overall /// register. /// /// If a lane in `v` is negative, that output is zeroed. /// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 3_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 7, 11, 2, 13, 4, 15, 6, 17, 8, /// 19, 20, 21, 22, 23, 24, 127, /// ]); /// let b = m256i::from([ /// -1_i8, 1, 0, 2, 2, 3, 4, 5, 6, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 12, 11, 10, 9, /// 8, 7, 6, 5, 4, /// ]); /// let c: [i8; 32] = shuffle_av_i8z_half_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// 0, 11, 3, 2, 2, 13, 4, 15, 6, 6, 17, 8, 8, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 22, 21, /// 20, 19, 8, 17, 6, 15, 4 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_shuffle_epi8`] /// * **Assembly:** `vpshufb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shuffle_av_i8z_half_m256i(a: m256i, v: m256i) -> m256i { m256i(unsafe { _mm256_shuffle_epi8(a.0, v.0) }) } /// Shuffle the high `i16` lanes in `a` using an immediate control value. /// /// The lower 128 bits and upper 128 bits have this performed separately. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b: [i16; 16] = shuffle_ai_i16_h64half_m256i::<0b_00_01_10_11>(a).into(); /// assert_eq!(b, [0, 1, 2, 3, 7, 6, 5, 4, 8, 9, 10, 11, 15, 14, 13, 12]); /// ``` /// * **Intrinsic:** [`_mm256_shufflehi_epi16`] /// * **Assembly:** `vpshufhw ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shuffle_ai_i16_h64half_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_shufflehi_epi16(a.0, IMM) }) } /// Shuffle the low `i16` lanes in `a` using an immediate control value. /// /// The lower 128 bits and upper 128 bits have this performed separately. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b: [i16; 16] = shuffle_ai_i16_l64half_m256i::<0b00_01_10_11>(a).into(); /// assert_eq!(b, [3, 2, 1, 0, 4, 5, 6, 7, 11, 10, 9, 8, 12, 13, 14, 15]); /// ``` /// * **Intrinsic:** [`_mm256_shufflelo_epi16`] /// * **Assembly:** `vpshuflw ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shuffle_ai_i16_l64half_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_shufflelo_epi16(a.0, IMM) }) } /// Lanewise `a * signum(b)` with lanes as `i8` /// /// * If `b` is positive, the output is `a`. /// * If `b` is zero, the output is 0. /// * If `b` is negative, the output is `-a`.
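/// The negation wraps like `i8::wrapping_neg`, so `-a` of `i8::MIN` stays `i8::MIN`; there is no saturation.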
/// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 3_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 7, 11, 2, 13, 4, 15, 6, 17, 8, /// 19, 20, 21, 22, 23, 24, 127, /// ]); /// let b = m256i::from([ /// -1_i8, -1, 0, 2, 2, 3, 0, 5, 6, 6, -7, 8, 8, 0, 0, 10, 10, -11, 11, 12, 12, 13, 13, 12, 11, /// -10, 9, 8, 7, 6, 5, -4, /// ]); /// let c: [i8; 32] = sign_apply_i8_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// -3, -11, 0, 13, 4, 15, 0, 17, 8, 19, -20, 21, 22, 0, 0, 127, 7, -11, 2, 13, 4, 15, 6, 17, 8, /// -19, 20, 21, 22, 23, 24, -127 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_sign_epi8`] /// * **Assembly:** `vpsignb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sign_apply_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_sign_epi8(a.0, b.0) }) } /// Lanewise `a * signum(b)` with lanes as `i16` /// /// * If `b` is positive, the output is `a`. /// * If `b` is zero, the output is 0. /// * If `b` is negative, the output is `-a`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]); /// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, -8, -7, 6, 5, 0, 0, 0, 978]); /// let c: [i16; 16] = sign_apply_i16_m256i(a, b).into(); /// assert_eq!(c, [5, 6, -2, -5, 0, 3, 1, 0, 12, -13, 56, 21, 0, 0, 0, 5]); /// ``` /// * **Intrinsic:** [`_mm256_sign_epi16`] /// * **Assembly:** `vpsignw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sign_apply_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_sign_epi16(a.0, b.0) }) } /// Lanewise `a * signum(b)` with lanes as `i32` /// /// * If `b` is positive, the output is `a`. /// * If `b` is zero, the output is 0. /// * If `b` is negative, the output is `-a`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]); /// let b = m256i::from([0_i32, 0, -2, -13, 4, 15, 6, -17]); /// let c: [i32; 8] = sign_apply_i32_m256i(a, b).into(); /// assert_eq!(c, [0_i32, 0, -2, -3, 4, 5, 6, -7]); /// ``` /// * **Intrinsic:** [`_mm256_sign_epi32`] /// * **Assembly:** `vpsignd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sign_apply_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_sign_epi32(a.0, b.0) }) } /// Lanewise `u16` shift left by the lower `u64` lane of `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_u16, 6, 2, 5, 4, 3, 1, 0, 12, 13, 56, 21, 8, 7, 6, 5]); /// let count = m128i::from(1_u128); /// let b: [u16; 16] = shl_all_u16_m256i(a, count).into(); /// assert_eq!(b, [10, 12, 4, 10, 8, 6, 2, 0, 24, 26, 112, 42, 16, 14, 12, 10]); /// ``` /// * **Intrinsic:** [`_mm256_sll_epi16`] /// * **Assembly:** `vpsllw ymm, ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_all_u16_m256i(a: m256i, count: m128i) -> m256i { m256i(unsafe { _mm256_sll_epi16(a.0, count.0) }) } /// Shift all `u32` lanes left by the lower `u64` lane of `count`. 
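/// A shift count of 32 or more zeroes every lane.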
/// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u32, 1, 2, 13, 4, 15, 6, 17]); /// let count = m128i::from(1_u128); /// let b: [u32; 8] = shl_all_u32_m256i(a, count).into(); /// assert_eq!(b, [0, 2, 4, 26, 8, 30, 12, 34]); /// ``` /// * **Intrinsic:** [`_mm256_sll_epi32`] /// * **Assembly:** `vpslld ymm, ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_all_u32_m256i(a: m256i, count: m128i) -> m256i { m256i(unsafe { _mm256_sll_epi32(a.0, count.0) }) } /// Shift all `u64` lanes left by the lower `u64` lane of `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u64, 1, 2, 13]); /// let count = m128i::from(1_u128); /// let b: [u64; 4] = shl_all_u64_m256i(a, count).into(); /// assert_eq!(b, [0, 2, 4, 26]); /// ``` /// * **Intrinsic:** [`_mm256_sll_epi64`] /// * **Assembly:** `vpsllq ymm, ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_all_u64_m256i(a: m256i, count: m128i) -> m256i { m256i(unsafe { _mm256_sll_epi64(a.0, count.0) }) } /// Shifts all `u16` lanes left by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_u16, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]); /// let c: [u16; 16] = shl_imm_u16_m256i::<1>(a).into(); /// assert_eq!(c, [2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8]); /// ``` /// * **Intrinsic:** [`_mm256_slli_epi16`] /// * **Assembly:** `vpsllw ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_imm_u16_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_slli_epi16(a.0, IMM) }) } /// Shifts all `u32` lanes left by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_u32, 2, 3, 4, 1, 2, 3, 4]); /// let c: [u32; 8] = shl_imm_u32_m256i::<1>(a).into(); /// assert_eq!(c, [1_u32 << 1, 2 << 1, 3 << 1, 4 << 1, 1 << 1, 2 << 1, 3 << 1, 4 << 1]); /// ``` /// * **Intrinsic:** [`_mm256_slli_epi32`] /// * **Assembly:** `vpslld ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_imm_u32_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_slli_epi32(a.0, IMM) }) } /// Shifts all `u64` lanes left by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_u64, 2, 3, 4]); /// let c: [u64; 4] = shl_imm_u64_m256i::<1>(a).into(); /// assert_eq!(c, [1_u64 << 1, 2 << 1, 3 << 1, 4 << 1,]); /// ``` /// * **Intrinsic:** [`_mm256_slli_epi64`] /// * **Assembly:** `vpsllq ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_imm_u64_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_slli_epi64(a.0, IMM) }) } /// Lanewise `u32` shift left by the matching `u32` lane in `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u32, 1, 2, 13, 5, 6, 7, 1]); /// let count = m256i::from([1_u32, 2, 3, 4, 5, 6, 7, 1]); /// let b: [u32; 8] = shl_each_u32_m256i(a, count).into(); /// assert_eq!(b, [0, 4, 16, 208, 160, 384, 896, 2]); /// ``` /// * **Intrinsic:** [`_mm256_sllv_epi32`] /// * **Assembly:** `vpsllvd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_each_u32_m256i(a: m256i, count: m256i) -> m256i { m256i(unsafe { _mm256_sllv_epi32(a.0, count.0) }) } /// Lanewise `u64` shift left by the matching `u64` lane in `count`.
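/// A per-lane shift count of 64 or more zeroes that lane.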
/// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u64, 1, 2, 13]); /// let count = m256i::from([1_u64, 2, 3, 4]); /// let b: [u64; 4] = shl_each_u64_m256i(a, count).into(); /// assert_eq!(b, [0, 4, 16, 208]); /// ``` /// * **Intrinsic:** [`_mm256_sllv_epi64`] /// * **Assembly:** `vpsllvq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shl_each_u64_m256i(a: m256i, count: m256i) -> m256i { m256i(unsafe { _mm256_sllv_epi64(a.0, count.0) }) } /// Lanewise `i16` shift right by the lower `i64` lane of `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]); /// let count = m128i::from(1_i128); /// let b: [i16; 16] = shr_all_i16_m256i(a, count).into(); /// assert_eq!(b, [2, 3, 1, 2, 2, 1, 0, 0, -6, 6, 28, 10, 4, 3, 3, 2]); /// ``` /// * **Intrinsic:** [`_mm256_sra_epi16`] /// * **Assembly:** `vpsraw ymm, ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_all_i16_m256i(a: m256i, count: m128i) -> m256i { m256i(unsafe { _mm256_sra_epi16(a.0, count.0) }) } /// Lanewise `i32` shift right by the lower `i64` lane of `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i32, 1, -2, -13, 4, 15, 6, -17]); /// let count = m128i::from(1_i128); /// let b: [i32; 8] = shr_all_i32_m256i(a, count).into(); /// assert_eq!(b, [0, 0, -1, -7, 2, 7, 3, -9]); /// ``` /// * **Intrinsic:** [`_mm256_sra_epi32`] /// * **Assembly:** `vpsrad ymm, ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_all_i32_m256i(a: m256i, count: m128i) -> m256i { m256i(unsafe { _mm256_sra_epi32(a.0, count.0) }) } /// Shifts all `i16` lanes right by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i16, 2, 3, 4, -1, -2, -3, -4, 1, 2, 3, 4, -1, -2, -3, -4]); /// let c: [i16; 16] = shr_imm_i16_m256i::<1>(a).into(); /// assert_eq!(c, [0_i16, 1, 1, 2, -1, -1, -2, -2, 0, 1, 1, 2, -1, -1, -2, -2]); /// ``` /// * **Intrinsic:** [`_mm256_srai_epi16`] /// * **Assembly:** `vpsraw ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_imm_i16_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_srai_epi16(a.0, IMM) }) } /// Shifts all `i32` lanes right by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i32, 2, 3, 4, -1, -2, -3, -4]); /// let c: [i32; 8] = shr_imm_i32_m256i::<1>(a).into(); /// assert_eq!(c, [0, 1, 1, 2, -1, -1, -2, -2]); /// ``` /// * **Intrinsic:** [`_mm256_srai_epi32`] /// * **Assembly:** `vpsrad ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_imm_i32_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_srai_epi32(a.0, IMM) }) } /// Lanewise `i32` shift right by the matching `i32` lane in `count`.
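/// This is an arithmetic shift, so a per-lane count of 32 or more fills that lane with copies of its sign bit.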
/// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i32, 1111, -2999, -13888, 5444, 6222, 7333, -11111]); /// let count = m256i::from([1_i32, 2, 3, 4, 5, 4, 3, 2]); /// let b: [i32; 8] = shr_each_i32_m256i(a, count).into(); /// assert_eq!(b, [0, 277, -375, -868, 170, 388, 916, -2778]); /// ``` /// * **Intrinsic:** [`_mm256_srav_epi32`] /// * **Assembly:** `vpsravd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_each_i32_m256i(a: m256i, count: m256i) -> m256i { m256i(unsafe { _mm256_srav_epi32(a.0, count.0) }) } /// Lanewise `u16` shift right by the lower `u64` lane of `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_u16, 6, 2, 5, 4, 3, 1, 0, 12, 13, 56, 21, 8, 7, 6, 5]); /// let count = m128i::from(1_u128); /// let b: [u16; 16] = shr_all_u16_m256i(a, count).into(); /// assert_eq!(b, [2, 3, 1, 2, 2, 1, 0, 0, 6, 6, 28, 10, 4, 3, 3, 2]); /// ``` /// * **Intrinsic:** [`_mm256_srl_epi16`] /// * **Assembly:** `vpsrlw ymm, ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_all_u16_m256i(a: m256i, count: m128i) -> m256i { m256i(unsafe { _mm256_srl_epi16(a.0, count.0) }) } /// Lanewise `u32` shift right by the lower `u64` lane of `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u32, 1, 2, 13, 4, 15, 6, 17]); /// let count = m128i::from(1_u128); /// let b: [u32; 8] = shr_all_u32_m256i(a, count).into(); /// assert_eq!(b, [0, 0, 1, 6, 2, 7, 3, 8]); /// ``` /// * **Intrinsic:** [`_mm256_srl_epi32`] /// * **Assembly:** `vpsrld ymm, ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_all_u32_m256i(a: m256i, count: m128i) -> m256i { m256i(unsafe { _mm256_srl_epi32(a.0, count.0) }) } /// Lanewise `u64` shift right by the lower `u64` lane of `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u64, 1, 2, 13]); /// let count = m128i::from(1_u128); /// let b: [u64; 4] = shr_all_u64_m256i(a, count).into(); /// assert_eq!(b, [0, 0, 1, 6]); /// ``` /// * **Intrinsic:** [`_mm256_srl_epi64`] /// * **Assembly:** `vpsrlq ymm, ymm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_all_u64_m256i(a: m256i, count: m128i) -> m256i { m256i(unsafe { _mm256_srl_epi64(a.0, count.0) }) } /// Shifts all `u16` lanes right by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i16, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); /// let c: [u16; 16] = shr_imm_u16_m256i::<1>(a).into(); /// assert_eq!(c, [0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8]); /// ``` /// * **Intrinsic:** [`_mm256_srli_epi16`] /// * **Assembly:** `vpsrlw ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_imm_u16_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_srli_epi16(a.0, IMM) }) } /// Shifts all `u32` lanes right by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i32, 2, 3, 4, 5, 6, 7, 8]); /// let c: [u32; 8] = shr_imm_u32_m256i::<1>(a).into(); /// assert_eq!(c, [0, 1, 1, 2, 2, 3, 3, 4]); /// ``` /// * **Intrinsic:** [`_mm256_srli_epi32`] /// * **Assembly:** `vpsrld ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_imm_u32_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_srli_epi32(a.0, IMM) }) } /// Shifts all `u64` lanes right by an immediate.
/// /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_u64, 2, 3, 4]); /// let c: [u64; 4] = shr_imm_u64_m256i::<1>(a).into(); /// assert_eq!(c, [0, 1, 1, 2]); /// ``` /// * **Intrinsic:** [`_mm256_srli_epi64`] /// * **Assembly:** `vpsrlq ymm, ymm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_imm_u64_m256i<const IMM: i32>(a: m256i) -> m256i { m256i(unsafe { _mm256_srli_epi64(a.0, IMM) }) } /// Lanewise `u32` shift right by the matching `u32` lane in `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u32, 1111, 2999, 13888, 5444, 6222, 7333, 11111]); /// let count = m256i::from([1_i32, 2, 3, 4, 5, 4, 3, 2]); /// let b: [u32; 8] = shr_each_u32_m256i(a, count).into(); /// assert_eq!(b, [0, 277, 374, 868, 170, 388, 916, 2777]); /// ``` /// * **Intrinsic:** [`_mm256_srlv_epi32`] /// * **Assembly:** `vpsrlvd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_each_u32_m256i(a: m256i, count: m256i) -> m256i { m256i(unsafe { _mm256_srlv_epi32(a.0, count.0) }) } /// Lanewise `u64` shift right by the matching `u64` lane in `count`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u64, 1111, 2999, 13888]); /// let count = m256i::from([1_u64, 2, 3, 4]); /// let b: [u64; 4] = shr_each_u64_m256i(a, count).into(); /// assert_eq!(b, [0, 277, 374, 868]); /// ``` /// * **Intrinsic:** [`_mm256_srlv_epi64`] /// * **Assembly:** `vpsrlvq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn shr_each_u64_m256i(a: m256i, count: m256i) -> m256i { m256i(unsafe { _mm256_srlv_epi64(a.0, count.0) }) } /// Lanewise `a - b` with lanes as `i8`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i8; 32]); /// let b = m256i::from([10_i8; 32]); /// let c: [i8; 32] = sub_i8_m256i(a, b).into(); /// assert_eq!(c, [-5_i8; 32]); /// ``` /// * **Intrinsic:** [`_mm256_sub_epi8`] /// * **Assembly:** `vpsubb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_sub_epi8(a.0, b.0) }) } /// Lanewise `a - b` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16; 16]); /// let b = m256i::from([10_i16; 16]); /// let c: [i16; 16] = sub_i16_m256i(a, b).into(); /// assert_eq!(c, [-5_i16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_sub_epi16`] /// * **Assembly:** `vpsubw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_sub_epi16(a.0, b.0) }) } /// Lanewise `a - b` with lanes as `i32`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i32; 8]); /// let b = m256i::from([10_i32; 8]); /// let c: [i32; 8] = sub_i32_m256i(a, b).into(); /// assert_eq!(c, [-5_i32; 8]); /// ``` /// * **Intrinsic:** [`_mm256_sub_epi32`] /// * **Assembly:** `vpsubd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_sub_epi32(a.0, b.0) }) } /// Lanewise `a - b` with lanes as `i64`.
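/// The subtraction wraps on overflow rather than saturating.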
/// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i64; 4]); /// let b = m256i::from([10_i64; 4]); /// let c: [i64; 4] = sub_i64_m256i(a, b).into(); /// assert_eq!(c, [-5_i64; 4]); /// ``` /// * **Intrinsic:** [`_mm256_sub_epi64`] /// * **Assembly:** `vpsubq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_i64_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_sub_epi64(a.0, b.0) }) } /// Lanewise saturating `a - b` with lanes as `i8`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([126_i8; 32]); /// let b = m256i::from([125_i8; 32]); /// let c: [i8; 32] = sub_saturating_i8_m256i(a, b).into(); /// assert_eq!(c, [1_i8; 32]); /// ``` /// * **Intrinsic:** [`_mm256_subs_epi8`] /// * **Assembly:** `vpsubsb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_saturating_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_subs_epi8(a.0, b.0) }) } /// Lanewise saturating `a - b` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([32700_i16; 16]); /// let b = m256i::from([32000_i16; 16]); /// let c: [i16; 16] = sub_saturating_i16_m256i(a, b).into(); /// assert_eq!(c, [700_i16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_subs_epi16`] /// * **Assembly:** `vpsubsw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_saturating_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_subs_epi16(a.0, b.0) }) } /// Lanewise saturating `a - b` with lanes as `u8`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([126_u8; 32]); /// let b = m256i::from([125_u8; 32]); /// let c: [u8; 32] = sub_saturating_u8_m256i(a, b).into(); /// assert_eq!(c, [1_u8; 32]); /// ``` /// * **Intrinsic:** [`_mm256_subs_epu8`] /// * **Assembly:** `vpsubusb ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_saturating_u8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_subs_epu8(a.0, b.0) }) } /// Lanewise saturating `a - b` with lanes as `u16`. /// ``` /// # use safe_arch::*; /// let a = m256i::from([32700_u16; 16]); /// let b = m256i::from([32000_u16; 16]); /// let c: [u16; 16] = sub_saturating_u16_m256i(a, b).into(); /// assert_eq!(c, [700_u16; 16]); /// ``` /// * **Intrinsic:** [`_mm256_subs_epu16`] /// * **Assembly:** `vpsubusw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn sub_saturating_u16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_subs_epu16(a.0, b.0) }) } /// Unpack and interleave high `i8` lanes of `a` and `b`. /// /// * Operates on the high half of each 128 bit portion. 
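/// * Within each half the output alternates sources: `a[8], b[8], a[9], b[9], ...` (and `a[24], b[24], ...` in the upper half).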
/// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 3_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 7, 11, 2, 13, 4, 15, 6, 17, 8, /// 19, 20, 21, 22, 23, 24, 127, /// ]); /// let b = m256i::from([ /// -1_i8, -1, 0, 2, 2, 3, 0, 5, 6, 6, -7, 8, 8, 0, 0, 10, 10, -11, 11, 12, 12, 13, 13, 12, 11, /// -10, 9, 8, 7, 6, 5, -4, /// ]); /// let c: [i8; 32] = unpack_high_i8_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// 8, 6, 19, 6, 20, -7, 21, 8, 22, 8, 23, 0, 24, 0, 127, 10, 8, 11, 19, -10, 20, 9, 21, 8, 22, /// 7, 23, 6, 24, 5, 127, -4 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_unpackhi_epi8`] /// * **Assembly:** `vpunpckhbw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn unpack_high_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_unpackhi_epi8(a.0, b.0) }) } /// Unpack and interleave high `i16` lanes of `a` and `b`. /// /// * Operates on the high half of each 128 bit portion. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]); /// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, -8, -7, 6, 5, 0, 0, 0, 978]); /// let c: [i16; 16] = unpack_high_i16_m256i(a, b).into(); /// assert_eq!(c, [4, 0, 3, 1, 1, 2, 0, 3, 8, 0, 7, 0, 6, 0, 5, 978]); /// ``` /// * **Intrinsic:** [`_mm256_unpackhi_epi16`] /// * **Assembly:** `vpunpckhwd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn unpack_high_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_unpackhi_epi16(a.0, b.0) }) } /// Unpack and interleave high `i32` lanes of `a` and `b`. /// /// * Operates on the high half of each 128 bit portion. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]); /// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]); /// let c: [i32; 8] = unpack_high_i32_m256i(a, b).into(); /// assert_eq!(c, [2, 2, 3, -13, 6, 6, 7, -17]); /// ``` /// * **Intrinsic:** [`_mm256_unpackhi_epi32`] /// * **Assembly:** `vpunpckhdq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn unpack_high_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_unpackhi_epi32(a.0, b.0) }) } /// Unpack and interleave high `i64` lanes of `a` and `b`. /// /// * Operates on the high half of each 128 bit portion. /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i64, 2, 3, 4]); /// let b = m256i::from([5_i64, 6, 7, -8]); /// let c: [i64; 4] = unpack_high_i64_m256i(a, b).into(); /// assert_eq!(c, [2, 6, 4, -8]); /// ``` /// * **Intrinsic:** [`_mm256_unpackhi_epi64`] /// * **Assembly:** `vpunpckhqdq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn unpack_high_i64_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_unpackhi_epi64(a.0, b.0) }) } /// Unpack and interleave low `i8` lanes of `a` and `b`. /// /// * Operates on the low half of each 128 bit portion. 
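/// * Within each half the output alternates sources: `a[0], b[0], a[1], b[1], ...` (and `a[16], b[16], ...` in the upper half).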
/// ``` /// # use safe_arch::*; /// let a = m256i::from([ /// 3_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127, 7, 11, 2, 13, 4, 15, 6, 17, 8, /// 19, 20, 21, 22, 23, 24, 127, /// ]); /// let b = m256i::from([ /// -1_i8, -1, 0, 2, 2, 3, 0, 5, 6, 6, -7, 8, 8, 0, 0, 10, 10, -11, 11, 12, 12, 13, 13, 12, 11, /// -10, 9, 8, 7, 6, 5, -4, /// ]); /// let c: [i8; 32] = unpack_low_i8_m256i(a, b).into(); /// assert_eq!( /// c, /// [ /// 3, -1, 11, -1, 2, 0, 13, 2, 4, 2, 15, 3, 6, 0, 17, 5, 7, 10, 11, -11, 2, 11, 13, 12, 4, 12, /// 15, 13, 6, 13, 17, 12 /// ] /// ); /// ``` /// * **Intrinsic:** [`_mm256_unpacklo_epi8`] /// * **Assembly:** `vpunpcklbw ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn unpack_low_i8_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_unpacklo_epi8(a.0, b.0) }) } /// Unpack and interleave low `i16` lanes of `a` and `b`. /// /// * Operates on the low half of each 128 bit portion. /// ``` /// # use safe_arch::*; /// let a = m256i::from([5_i16, 6, 2, 5, 4, 3, 1, 0, -12, 13, 56, 21, 8, 7, 6, 5]); /// let b = m256i::from([12000_i16, 13000, -2, -8, 0, 1, 2, 3, -8, -7, 6, 5, 0, 0, 0, 978]); /// let c: [i16; 16] = unpack_low_i16_m256i(a, b).into(); /// assert_eq!(c, [5, 12000, 6, 13000, 2, -2, 5, -8, -12, -8, 13, -7, 56, 6, 21, 5]); /// ``` /// * **Intrinsic:** [`_mm256_unpacklo_epi16`] /// * **Assembly:** `vpunpcklwd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn unpack_low_i16_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_unpacklo_epi16(a.0, b.0) }) } /// Unpack and interleave low `i32` lanes of `a` and `b`. /// /// * Operates on the low half of each 128 bit portion. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i32, 1, 2, 3, 4, 5, 6, 7]); /// let b = m256i::from([0_i32, 11, 2, -13, 4, 15, 6, -17]); /// let c: [i32; 8] = unpack_low_i32_m256i(a, b).into(); /// assert_eq!(c, [0, 0, 1, 11, 4, 4, 5, 15]); /// ``` /// * **Intrinsic:** [`_mm256_unpacklo_epi32`] /// * **Assembly:** `vpunpckldq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn unpack_low_i32_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_unpacklo_epi32(a.0, b.0) }) } /// Unpack and interleave low `i64` lanes of `a` and `b`. /// /// * Operates on the low half of each 128 bit portion. /// ``` /// # use safe_arch::*; /// let a = m256i::from([1_i64, 2, 3, 4]); /// let b = m256i::from([5_i64, 6, 7, -8]); /// let c: [i64; 4] = unpack_low_i64_m256i(a, b).into(); /// assert_eq!(c, [1, 5, 3, 7]); /// ``` /// * **Intrinsic:** [`_mm256_unpacklo_epi64`] /// * **Assembly:** `vpunpcklqdq ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn unpack_low_i64_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_unpacklo_epi64(a.0, b.0) }) } /// Bitwise `a ^ b`.
/// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i64, 0, 1, 1]); /// let b = m256i::from([0_i64, 1, 0, 1]); /// let c: [i64; 4] = bitxor_m256i(a, b).into(); /// assert_eq!(c, [0_i64, 1, 1, 0]); /// ``` /// * **Intrinsic:** [`_mm256_xor_si256`] /// * **Assembly:** `vpxor ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "avx2")))] pub fn bitxor_m256i(a: m256i, b: m256i) -> m256i { m256i(unsafe { _mm256_xor_si256(a.0, b.0) }) } impl Not for m256i { type Output = Self; /// Not a direct intrinsic, but it's very useful and the implementation is /// simple enough. /// /// Negates the bits by performing an `xor` with an all-1s bit pattern. /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_u128, 0]); /// let c: [u128; 2] = (!a).into(); /// assert_eq!(c, [u128::MAX, u128::MAX]); /// ``` #[must_use] #[inline(always)] fn not(self) -> Self { let all_bits = set_splat_i16_m256i(-1); self ^ all_bits } } impl BitAnd for m256i { type Output = Self; /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i64, 0, 1, 1]); /// let b = m256i::from([0_i64, 1, 0, 1]); /// let c: [i64; 4] = (a & b).into(); /// assert_eq!(c, [0_i64, 0, 0, 1]); /// ``` #[must_use] #[inline(always)] fn bitand(self, rhs: Self) -> Self { bitand_m256i(self, rhs) } } impl BitAndAssign for m256i { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = *self & rhs; } } impl BitOr for m256i { type Output = Self; /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i64, 0, 1, 1]); /// let b = m256i::from([0_i64, 1, 0, 1]); /// let c: [i64; 4] = (a | b).into(); /// assert_eq!(c, [0_i64, 1, 1, 1]); /// ``` #[must_use] #[inline(always)] fn bitor(self, rhs: Self) -> Self { bitor_m256i(self, rhs) } } impl BitOrAssign for m256i { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = *self | rhs; } } impl BitXor for m256i { type Output = Self; /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i64, 0, 1, 1]); /// let b = m256i::from([0_i64, 1, 0, 1]); /// let c: [i64; 4] = (a ^ b).into(); /// assert_eq!(c, [0_i64, 1, 1, 0]); /// ``` #[must_use] #[inline(always)] fn bitxor(self, rhs: Self) -> Self { bitxor_m256i(self, rhs) } } impl BitXorAssign for m256i { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = *self ^ rhs; } } impl PartialEq for m256i { #[must_use] #[inline(always)] /// ``` /// # use safe_arch::*; /// let a = m256i::from([0_i64, 0, 1, 1]); /// let b = m256i::from([0_i64, 1, 0, 1]); /// assert_eq!(a, a); /// assert_ne!(a, b); /// ``` fn eq(&self, other: &Self) -> bool { let mask = cmp_eq_mask_i8_m256i(*self, *other); move_mask_i8_m256i(mask) == -1_i32 } } impl Eq for m256i {} safe_arch-0.7.1/src/x86_x64/bmi1.rs000066400000000000000000000121421445526200400165470ustar00rootroot00000000000000#![cfg(target_feature = "bmi1")] use super::*; /// Bitwise `(!a) & b` for `u32` /// /// * **Intrinsic:** [`_andn_u32`] /// * **Assembly:** `andn r32, r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bitandnot_u32(a: u32, b: u32) -> u32 { unsafe { _andn_u32(a, b) } } /// Bitwise `(!a) & b` for `u64` /// /// * **Intrinsic:** [`_andn_u64`] /// * **Assembly:** `andn r64, r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bitandnot_u64(a: u64, b: u64) -> u64 { unsafe { _andn_u64(a, b) } } /// Extract a span of bits from the `u32`, start and len style. 
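/// A small usage sketch of the start/len form (extract 3 bits starting at bit 2; the selected bits land at the bottom of the result):
/// ```
/// # use safe_arch::*;
/// assert_eq!(bit_extract_u32(0b0110_1100, 2, 3), 0b011);
/// ```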
/// /// * **Intrinsic:** [`_bextr_u32`] /// * **Assembly:** `bextr r32, r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_extract_u32(a: u32, start: u32, len: u32) -> u32 { unsafe { _bextr_u32(a, start, len) } } /// Extract a span of bits from the `u64`, start and len style. /// /// * **Intrinsic:** [`_bextr_u64`] /// * **Assembly:** `bextr r64, r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_extract_u64(a: u64, start: u32, len: u32) -> u64 { unsafe { _bextr_u64(a, start, len) } } /// Extract a span of bits from the `u32`, control value style. /// /// * Bits 0 through 7: start position. /// * Bits 8 through 15: span length. /// /// * **Intrinsic:** [`_bextr2_u32`] /// * **Assembly:** `bextr r32, r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_extract2_u32(a: u32, control: u32) -> u32 { unsafe { _bextr2_u32(a, control) } } /// Extract a span of bits from the `u64`, control value style. /// /// * Bits 0 through 7: start position. /// * Bits 8 through 15: span length. /// /// * **Intrinsic:** [`_bextr2_u64`] /// * **Assembly:** `bextr r64, r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_extract2_u64(a: u64, control: u64) -> u64 { unsafe { _bextr2_u64(a, control) } } /// Gets the *value* of the lowest set bit in a `u32`. /// /// If the input is 0 you get 0 back. /// /// * Formula: `(!a + 1) & a` /// /// * **Intrinsic:** [`_blsi_u32`] /// * **Assembly:** `blsi r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_lowest_set_value_u32(a: u32) -> u32 { unsafe { _blsi_u32(a) } } /// Gets the *value* of the lowest set bit in a `u64`. /// /// If the input is 0 you get 0 back. /// /// * Formula: `(!a + 1) & a` /// /// * **Intrinsic:** [`_blsi_u64`] /// * **Assembly:** `blsi r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_lowest_set_value_u64(a: u64) -> u64 { unsafe { _blsi_u64(a) } } /// Gets the mask of all bits up to and including the lowest set bit in a `u32`. /// /// If the input is 0, you get `u32::MAX` /// /// * Formula: `(a - 1) ^ a` /// /// * **Intrinsic:** [`_blsmsk_u32`] /// * **Assembly:** `blsmsk r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_lowest_set_mask_u32(a: u32) -> u32 { unsafe { _blsmsk_u32(a) } } /// Gets the mask of all bits up to and including the lowest set bit in a `u64`. /// /// If the input is 0, you get `u64::MAX` /// /// * Formula: `(a - 1) ^ a` /// /// * **Intrinsic:** [`_blsmsk_u64`] /// * **Assembly:** `blsmsk r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_lowest_set_mask_u64(a: u64) -> u64 { unsafe { _blsmsk_u64(a) } } /// Resets (clears) the lowest set bit. /// /// If the input is 0 you get 0 back. /// /// * Formula: `(a - 1) & a` /// /// * **Intrinsic:** [`_blsr_u32`] /// * **Assembly:** `blsr r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_lowest_set_reset_u32(a: u32) -> u32 { unsafe { _blsr_u32(a) } } /// Resets (clears) the lowest set bit. /// /// If the input is 0 you get 0 back.
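/// Repeatedly clearing the lowest set bit like this is a handy way to walk the set bits of a value.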
/// /// * Formula: `(a - 1) & a` /// /// * **Intrinsic:** [`_blsr_u64`] /// * **Assembly:** `blsr r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn bit_lowest_set_reset_u64(a: u64) -> u64 { unsafe { _blsr_u64(a) } } /// Counts the number of trailing zero bits in a `u32`. /// /// An input of 0 gives 32. /// /// * **Intrinsic:** [`_tzcnt_u32`] /// * **Assembly:** `tzcnt r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn trailing_zero_count_u32(a: u32) -> u32 { unsafe { _tzcnt_u32(a) } } /// Counts the number of trailing zero bits in a `u64`. /// /// An input of 0 gives 64. /// /// * **Intrinsic:** [`_tzcnt_u64`] /// * **Assembly:** `tzcnt r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi1")))] pub fn trailing_zero_count_u64(a: u64) -> u64 { unsafe { _tzcnt_u64(a) } } safe_arch-0.7.1/src/x86_x64/bmi2.rs000066400000000000000000000057011445526200400165530ustar00rootroot00000000000000#![cfg(target_feature = "bmi2")] use super::*; /// Zero out all high bits in a `u32` starting at the index given. /// /// * **Intrinsic:** [`_bzhi_u32`] /// * **Assembly:** `bzhi r32, r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi2")))] pub fn bit_zero_high_index_u32(a: u32, index: u32) -> u32 { unsafe { _bzhi_u32(a, index) } } /// Zero out all high bits in a `u64` starting at the index given. /// /// * **Intrinsic:** [`_bzhi_u64`] /// * **Assembly:** `bzhi r64, r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi2")))] pub fn bit_zero_high_index_u64(a: u64, index: u32) -> u64 { unsafe { _bzhi_u64(a, index) } } /// Multiply two `u32`, outputting the low bits and storing the high bits in the /// reference. /// /// This does not read or write arithmetic flags. /// /// * **Intrinsic:** [`_mulx_u32`] /// * **Assembly:** `mulx r32, r32, m32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi2")))] pub fn mul_extended_u32(a: u32, b: u32, extra: &mut u32) -> u32 { unsafe { _mulx_u32(a, b, extra) } } /// Multiply two `u64`, outputting the low bits and storing the high bits in the /// reference. /// /// This does not read or write arithmetic flags. /// /// * **Intrinsic:** [`_mulx_u64`] /// * **Assembly:** `mulx r64, r64, m64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi2")))] pub fn mul_extended_u64(a: u64, b: u64, extra: &mut u64) -> u64 { unsafe { _mulx_u64(a, b, extra) } } /// Deposit contiguous low bits from a `u32` according to a mask. /// /// Other bits are zero. /// /// * **Intrinsic:** [`_pdep_u32`] /// * **Assembly:** `pdep r32, r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi2")))] pub fn population_deposit_u32(a: u32, index: u32) -> u32 { unsafe { _pdep_u32(a, index) } } /// Deposit contiguous low bits from a `u64` according to a mask. /// /// Other bits are zero. /// /// * **Intrinsic:** [`_pdep_u64`] /// * **Assembly:** `pdep r64, r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi2")))] pub fn population_deposit_u64(a: u64, index: u64) -> u64 { unsafe { _pdep_u64(a, index) } } /// Extract bits from a `u32` according to a mask. 
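/// A small usage sketch (the bits of `a` selected by the mask are gathered down into the low bits of the result):
/// ```
/// # use safe_arch::*;
/// assert_eq!(population_extract_u32(0b0101_0000, 0b1111_0000), 0b0101);
/// ```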
/// /// * **Intrinsic:** [`_pext_u32`] /// * **Assembly:** `pext r32, r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi2")))] pub fn population_extract_u32(a: u32, index: u32) -> u32 { unsafe { _pext_u32(a, index) } } /// Extract bits from a `u64` according to a mask. /// /// * **Intrinsic:** [`_pext_u64`] /// * **Assembly:** `pext r64, r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "bmi2")))] pub fn population_extract_u64(a: u64, index: u64) -> u64 { unsafe { _pext_u64(a, index) } } safe_arch-0.7.1/src/x86_x64/fma.rs000066400000000000000000000567431445526200400165010ustar00rootroot00000000000000#![cfg(target_feature = "fma")] use super::*; // Note(Lokathor): There's only a few significant operations here but each // operation has variants for each of the four floating SIMD types (m128, m128d, // m256, and m256d), as well as possibly working on the scalar forms. We should // try to keep each operation grouping sorted into the order: m128, m128_s, // m128d, m128d_s, m256, m256d // // MUL ADD // /// Lanewise fused `(a * b) + c` /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_add_m128(a, b, c).to_array(); /// assert_eq!(d, [9.0, 16.0, 25.0, 36.0]); /// ``` /// * **Intrinsic:** [`_mm_fmadd_ps`] /// * **Assembly:** one of /// * `vfmadd132ps xmm, xmm, xmm` /// * `vfmadd213ps xmm, xmm, xmm` /// * `vfmadd231ps xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_add_m128(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fmadd_ps(a.0, b.0, c.0) }) } /// Low lane fused `(a * b) + c`, other lanes unchanged /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_add_m128_s(a, b, c).to_array(); /// assert_eq!(d, [9.0, 3.0, 4.0, 5.0]); /// ``` /// * **Intrinsic:** [`_mm_fmadd_ss`] /// * **Assembly:** one of /// * `vfmadd132ss xmm, xmm, xmm` /// * `vfmadd213ss xmm, xmm, xmm` /// * `vfmadd231ss xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_add_m128_s(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fmadd_ss(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) + c` /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_add_m128d(a, b, c).to_array(); /// assert_eq!(d, [9.0, 16.0]); /// ``` /// * **Intrinsic:** [`_mm_fmadd_pd`] /// * **Assembly:** one of /// * `vfmadd132pd xmm, xmm, xmm` /// * `vfmadd213pd xmm, xmm, xmm` /// * `vfmadd231pd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_add_m128d(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fmadd_pd(a.0, b.0, c.0) }) } /// Low lane fused `(a * b) + c`, other lanes unchanged /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_add_m128d_s(a, b, c).to_array(); /// assert_eq!(d, [9.0, 3.0]); /// ``` /// * **Intrinsic:** [`_mm_fmadd_sd`] /// * **Assembly:** one of /// * 
`vfmadd132sd xmm, xmm, xmm` /// * `vfmadd213sd xmm, xmm, xmm` /// * `vfmadd231sd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_add_m128d_s(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fmadd_sd(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) + c` /// ``` /// # use safe_arch::*; /// let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]); /// let b = m256::from_array([5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0]); /// let c = m256::from_array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_add_m256(a, b, c).to_array(); /// assert_eq!(d, [6.0, 13.0, 22.0, 33.0, 6.0, 13.0, 22.0, 33.0]); /// ``` /// * **Intrinsic:** [`_mm256_fmadd_ps`] /// * **Assembly:** one of /// * `vfmadd132ps ymm, ymm, ymm` /// * `vfmadd213ps ymm, ymm, ymm` /// * `vfmadd231ps ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_add_m256(a: m256, b: m256, c: m256) -> m256 { m256(unsafe { _mm256_fmadd_ps(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) + c` /// ``` /// # use safe_arch::*; /// let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m256d::from_array([5.0, 6.0, 7.0, 8.0]); /// let c = m256d::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_add_m256d(a, b, c).to_array(); /// assert_eq!(d, [6.0, 13.0, 22.0, 33.0]); /// ``` /// * **Intrinsic:** [`_mm256_fmadd_pd`] /// * **Assembly:** one of /// * `vfmadd132pd ymm, ymm, ymm` /// * `vfmadd213pd ymm, ymm, ymm` /// * `vfmadd231pd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_add_m256d(a: m256d, b: m256d, c: m256d) -> m256d { m256d(unsafe { _mm256_fmadd_pd(a.0, b.0, c.0) }) } // // MUL SUB // /// Lanewise fused `(a * b) - c` /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_sub_m128(a, b, c).to_array(); /// assert_eq!(d, [7.0, 14.0, 23.0, 34.0]); /// ``` /// * **Intrinsic:** [`_mm_fmsub_ps`] /// * **Assembly:** one of /// * `vfmsub132ps xmm, xmm, xmm` /// * `vfmsub213ps xmm, xmm, xmm` /// * `vfmsub231ps xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_sub_m128(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fmsub_ps(a.0, b.0, c.0) }) } /// Low lane fused `(a * b) - c`, other lanes unchanged. 
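/// The unchanged lanes of the output are copied from `a`.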
/// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_sub_m128_s(a, b, c).to_array(); /// assert_eq!(d, [7.0, 3.0, 4.0, 5.0]); /// ``` /// * **Intrinsic:** [`_mm_fmsub_ss`] /// * **Assembly:** one of /// * `vfmsub132ss xmm, xmm, xmm` /// * `vfmsub213ss xmm, xmm, xmm` /// * `vfmsub231ss xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_sub_m128_s(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fmsub_ss(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) - c` /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_sub_m128d(a, b, c).to_array(); /// assert_eq!(d, [7.0, 14.0]); /// ``` /// * **Intrinsic:** [`_mm_fmsub_pd`] /// * **Assembly:** one of /// * `vfmsub132pd xmm, xmm, xmm` /// * `vfmsub213pd xmm, xmm, xmm` /// * `vfmsub231pd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_sub_m128d(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fmsub_pd(a.0, b.0, c.0) }) } /// Low lane fused `(a * b) - c`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_sub_m128d_s(a, b, c).to_array(); /// assert_eq!(d, [7.0, 3.0]); /// ``` /// * **Intrinsic:** [`_mm_fmsub_sd`] /// * **Assembly:** one of /// * `vfmsub132sd xmm, xmm, xmm` /// * `vfmsub213sd xmm, xmm, xmm` /// * `vfmsub231sd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_sub_m128d_s(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fmsub_sd(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) - c` /// ``` /// # use safe_arch::*; /// let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]); /// let b = m256::from_array([5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0]); /// let c = m256::from_array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_sub_m256(a, b, c).to_array(); /// assert_eq!(d, [4.0, 11.0, 20.0, 31.0, 4.0, 11.0, 20.0, 31.0]); /// ``` /// * **Intrinsic:** [`_mm256_fmsub_ps`] /// * **Assembly:** one of /// * `vfmsub132ps ymm, ymm, ymm` /// * `vfmsub213ps ymm, ymm, ymm` /// * `vfmsub231ps ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_sub_m256(a: m256, b: m256, c: m256) -> m256 { m256(unsafe { _mm256_fmsub_ps(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) - c` /// ``` /// # use safe_arch::*; /// let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m256d::from_array([5.0, 6.0, 7.0, 8.0]); /// let c = m256d::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_sub_m256d(a, b, c).to_array(); /// assert_eq!(d, [4.0, 11.0, 20.0, 31.0]); /// ``` /// * **Intrinsic:** [`_mm256_fmsub_pd`] /// * **Assembly:** one of /// * `vfmsub132pd ymm, ymm, ymm` /// * `vfmsub213pd ymm, ymm, ymm` /// * `vfmsub231pd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_sub_m256d(a: m256d, b: m256d, c: m256d) -> m256d { m256d(unsafe { _mm256_fmsub_pd(a.0, b.0, c.0) }) } // // MUL NEG ADD // /// Lanewise fused `-(a * b) + c` /// ``` /// # use 
safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_neg_add_m128(a, b, c).to_array(); /// assert_eq!(d, [-7.0, -14.0, -23.0, -34.0]); /// ``` /// * **Intrinsic:** [`_mm_fnmadd_ps`] /// * **Assembly:** one of /// * `vfnmadd132ps xmm, xmm, xmm` /// * `vfnmadd213ps xmm, xmm, xmm` /// * `vfnmadd231ps xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_add_m128(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fnmadd_ps(a.0, b.0, c.0) }) } /// Low lane `-(a * b) + c`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_neg_add_m128_s(a, b, c).to_array(); /// assert_eq!(d, [-7.0, 3.0, 4.0, 5.0]); /// ``` /// * **Intrinsic:** [`_mm_fnmadd_ss`] /// * **Assembly:** one of /// * `vfnmadd132ss xmm, xmm, xmm` /// * `vfnmadd213ss xmm, xmm, xmm` /// * `vfnmadd231ss xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_add_m128_s(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fnmadd_ss(a.0, b.0, c.0) }) } /// Lanewise fused `-(a * b) + c` /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_neg_add_m128d(a, b, c).to_array(); /// assert_eq!(d, [-7.0, -14.0]); /// ``` /// * **Intrinsic:** [`_mm_fnmadd_pd`] /// * **Assembly:** one of /// * `vfnmadd132pd xmm, xmm, xmm` /// * `vfnmadd213pd xmm, xmm, xmm` /// * `vfnmadd231pd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_add_m128d(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fnmadd_pd(a.0, b.0, c.0) }) } /// Low lane `-(a * b) + c`, other lanes unchanged. 
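/// The unchanged lane of the output is copied from `a`.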
/// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_neg_add_m128d_s(a, b, c).to_array(); /// assert_eq!(d, [-7.0, 3.0]); /// ``` /// * **Intrinsic:** [`_mm_fnmadd_sd`] /// * **Assembly:** one of /// * `vfnmadd132sd xmm, xmm, xmm` /// * `vfnmadd213sd xmm, xmm, xmm` /// * `vfnmadd231sd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_add_m128d_s(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fnmadd_sd(a.0, b.0, c.0) }) } /// Lanewise fused `-(a * b) + c` /// ``` /// # use safe_arch::*; /// let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]); /// let b = m256::from_array([5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0]); /// let c = m256::from_array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_neg_add_m256(a, b, c).to_array(); /// assert_eq!(d, [-4.0, -11.0, -20.0, -31.0, -4.0, -11.0, -20.0, -31.0]); /// ``` /// * **Intrinsic:** [`_mm256_fnmadd_ps`] /// * **Assembly:** one of /// * `vfnmadd132ps ymm, ymm, ymm` /// * `vfnmadd213ps ymm, ymm, ymm` /// * `vfnmadd231ps ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_add_m256(a: m256, b: m256, c: m256) -> m256 { m256(unsafe { _mm256_fnmadd_ps(a.0, b.0, c.0) }) } /// Lanewise fused `-(a * b) + c` /// ``` /// # use safe_arch::*; /// let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m256d::from_array([5.0, 6.0, 7.0, 8.0]); /// let c = m256d::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_neg_add_m256d(a, b, c).to_array(); /// assert_eq!(d, [-4.0, -11.0, -20.0, -31.0]); /// ``` /// * **Intrinsic:** [`_mm256_fnmadd_pd`] /// * **Assembly:** one of /// * `vfnmadd132pd ymm, ymm, ymm` /// * `vfnmadd213pd ymm, ymm, ymm` /// * `vfnmadd231pd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_add_m256d(a: m256d, b: m256d, c: m256d) -> m256d { m256d(unsafe { _mm256_fnmadd_pd(a.0, b.0, c.0) }) } // // MUL NEG SUB // /// Lanewise fused `-(a * b) - c` /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_neg_sub_m128(a, b, c).to_array(); /// assert_eq!(d, [-9.0, -16.0, -25.0, -36.0]); /// ``` /// * **Intrinsic:** [`_mm_fnmsub_ps`] /// * **Assembly:** one of /// * `vfnmsub132ps xmm, xmm, xmm` /// * `vfnmsub213ps xmm, xmm, xmm` /// * `vfnmsub231ps xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_sub_m128(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fnmsub_ps(a.0, b.0, c.0) }) } /// Low lane fused `-(a * b) - c`, other lanes unchanged. 
/// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_neg_sub_m128_s(a, b, c).to_array(); /// assert_eq!(d, [-9.0, 3.0, 4.0, 5.0]); /// ``` /// * **Intrinsic:** [`_mm_fnmsub_ss`] /// * **Assembly:** one of /// * `vfnmsub132ss xmm, xmm, xmm` /// * `vfnmsub213ss xmm, xmm, xmm` /// * `vfnmsub231ss xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_sub_m128_s(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fnmsub_ss(a.0, b.0, c.0) }) } /// Lanewise fused `-(a * b) - c` /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_neg_sub_m128d(a, b, c).to_array(); /// assert_eq!(d, [-9.0, -16.0]); /// ``` /// * **Intrinsic:** [`_mm_fnmsub_pd`] /// * **Assembly:** one of /// * `vfnmsub132pd xmm, xmm, xmm` /// * `vfnmsub213pd xmm, xmm, xmm` /// * `vfnmsub231pd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_sub_m128d(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fnmsub_pd(a.0, b.0, c.0) }) } /// Low lane fused `-(a * b) - c`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_neg_sub_m128d_s(a, b, c).to_array(); /// assert_eq!(d, [-9.0, 3.0]); /// ``` /// * **Intrinsic:** [`_mm_fnmsub_sd`] /// * **Assembly:** one of /// * `vfnmsub132sd xmm, xmm, xmm` /// * `vfnmsub213sd xmm, xmm, xmm` /// * `vfnmsub231sd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_sub_m128d_s(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fnmsub_sd(a.0, b.0, c.0) }) } /// Lanewise fused `-(a * b) - c` /// ``` /// # use safe_arch::*; /// let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]); /// let b = m256::from_array([5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0]); /// let c = m256::from_array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_neg_sub_m256(a, b, c).to_array(); /// assert_eq!(d, [-6.0, -13.0, -22.0, -33.0, -6.0, -13.0, -22.0, -33.0]); /// ``` /// * **Intrinsic:** [`_mm256_fnmsub_ps`] /// * **Assembly:** one of /// * `vfnmsub132ps ymm, ymm, ymm` /// * `vfnmsub213ps ymm, ymm, ymm` /// * `vfnmsub231ps ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_sub_m256(a: m256, b: m256, c: m256) -> m256 { m256(unsafe { _mm256_fnmsub_ps(a.0, b.0, c.0) }) } /// Lanewise fused `-(a * b) - c` /// ``` /// # use safe_arch::*; /// let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m256d::from_array([5.0, 6.0, 7.0, 8.0]); /// let c = m256d::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_neg_sub_m256d(a, b, c).to_array(); /// assert_eq!(d, [-6.0, -13.0, -22.0, -33.0]); /// ``` /// * **Intrinsic:** [`_mm256_fnmsub_pd`] /// * **Assembly:** one of /// * `vfnmsub132pd ymm, ymm, ymm` /// * `vfnmsub213pd ymm, ymm, ymm` /// * `vfnmsub231pd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_neg_sub_m256d(a: m256d, b: m256d, c: m256d) -> m256d { m256d(unsafe { _mm256_fnmsub_pd(a.0, b.0, c.0) }) 
} // // MUL ADDSUB // /// Lanewise fused `(a * b) addsub c` (adds odd lanes and subtracts even lanes) /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_addsub_m128(a, b, c).to_array(); /// assert_eq!(d, [7.0, 16.0, 23.0, 36.0]); /// ``` /// * **Intrinsic:** [`_mm_fmaddsub_ps`] /// * **Assembly:** one of /// * `vfmaddsub132ps xmm, xmm, xmm` /// * `vfmaddsub213ps xmm, xmm, xmm` /// * `vfmaddsub231ps xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_addsub_m128(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fmaddsub_ps(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) addsub c` (adds odd lanes and subtracts even lanes) /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_addsub_m128d(a, b, c).to_array(); /// assert_eq!(d, [7.0, 16.0]); /// ``` /// * **Intrinsic:** [`_mm_fmaddsub_pd`] /// * **Assembly:** one of /// * `vfmaddsub132pd xmm, xmm, xmm` /// * `vfmaddsub213pd xmm, xmm, xmm` /// * `vfmaddsub231pd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_addsub_m128d(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fmaddsub_pd(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) addsub c` (adds odd lanes and subtracts even lanes) /// ``` /// # use safe_arch::*; /// let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]); /// let b = m256::from_array([5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0]); /// let c = m256::from_array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_addsub_m256(a, b, c).to_array(); /// assert_eq!(d, [4.0, 13.0, 20.0, 33.0, 4.0, 13.0, 20.0, 33.0]); /// ``` /// * **Intrinsic:** [`_mm256_fmaddsub_ps`] /// * **Assembly:** one of /// * `vfmaddsub132ps ymm, ymm, ymm` /// * `vfmaddsub213ps ymm, ymm, ymm` /// * `vfmaddsub231ps ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_addsub_m256(a: m256, b: m256, c: m256) -> m256 { m256(unsafe { _mm256_fmaddsub_ps(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) addsub c` (adds odd lanes and subtracts even lanes) /// ``` /// # use safe_arch::*; /// let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m256d::from_array([5.0, 6.0, 7.0, 8.0]); /// let c = m256d::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_addsub_m256d(a, b, c).to_array(); /// assert_eq!(d, [4.0, 13.0, 20.0, 33.0]); /// ``` /// * **Intrinsic:** [`_mm256_fmaddsub_pd`] /// * **Assembly:** one of /// * `vfmaddsub132pd ymm, ymm, ymm` /// * `vfmaddsub213pd ymm, ymm, ymm` /// * `vfmaddsub231pd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_addsub_m256d(a: m256d, b: m256d, c: m256d) -> m256d { m256d(unsafe { _mm256_fmaddsub_pd(a.0, b.0, c.0) }) } // // MUL SUBADD // /// Lanewise fused `(a * b) subadd c` (subtracts odd lanes and adds even lanes) /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 3.0, 4.0, 5.0]); /// let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); /// let c = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_subadd_m128(a, b, c).to_array(); /// assert_eq!(d, [9.0, 14.0, 25.0, 34.0]); /// ``` /// * **Intrinsic:** [`_mm_fmsubadd_ps`] /// 
* **Assembly:** /// * `vfmsubadd132ps xmm, xmm, xmm` /// * `vfmsubadd213ps xmm, xmm, xmm` /// * `vfmsubadd231ps xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_subadd_m128(a: m128, b: m128, c: m128) -> m128 { m128(unsafe { _mm_fmsubadd_ps(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) subadd c` (subtracts odd lanes and adds even lanes) /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 3.0]); /// let b = m128d::from_array([4.0, 5.0]); /// let c = m128d::from_array([1.0, 1.0]); /// let d = fused_mul_subadd_m128d(a, b, c).to_array(); /// assert_eq!(d, [9.0, 14.0]); /// ``` /// * **Intrinsic:** [`_mm_fmsubadd_pd`] /// * **Assembly:** one of /// * `vfmsubadd132pd xmm, xmm, xmm` /// * `vfmsubadd213pd xmm, xmm, xmm` /// * `vfmsubadd231pd xmm, xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_subadd_m128d(a: m128d, b: m128d, c: m128d) -> m128d { m128d(unsafe { _mm_fmsubadd_pd(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) subadd c` (subtracts odd lanes and adds even lanes) /// ``` /// # use safe_arch::*; /// let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0]); /// let b = m256::from_array([5.0, 6.0, 7.0, 8.0, 5.0, 6.0, 7.0, 8.0]); /// let c = m256::from_array([1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_subadd_m256(a, b, c).to_array(); /// assert_eq!(d, [6.0, 11.0, 22.0, 31.0, 6.0, 11.0, 22.0, 31.0]); /// ``` /// * **Intrinsic:** [`_mm256_fmsubadd_ps`] /// * **Assembly:** one of /// * `vfmsubadd132ps ymm, ymm, ymm` /// * `vfmsubadd213ps ymm, ymm, ymm` /// * `vfmsubadd231ps ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_subadd_m256(a: m256, b: m256, c: m256) -> m256 { m256(unsafe { _mm256_fmsubadd_ps(a.0, b.0, c.0) }) } /// Lanewise fused `(a * b) subadd c` (subtracts odd lanes and adds even lanes) /// ``` /// # use safe_arch::*; /// let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m256d::from_array([5.0, 6.0, 7.0, 8.0]); /// let c = m256d::from_array([1.0, 1.0, 1.0, 1.0]); /// let d = fused_mul_subadd_m256d(a, b, c).to_array(); /// assert_eq!(d, [6.0, 11.0, 22.0, 31.0]); /// ``` /// * **Intrinsic:** [`_mm256_fmsubadd_pd`] /// * **Assembly:** one of /// * `vfmsubadd132pd ymm, ymm, ymm` /// * `vfmsubadd213pd ymm, ymm, ymm` /// * `vfmsubadd231pd ymm, ymm, ymm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "fma")))] pub fn fused_mul_subadd_m256d(a: m256d, b: m256d, c: m256d) -> m256d { m256d(unsafe { _mm256_fmsubadd_pd(a.0, b.0, c.0) }) } safe_arch-0.7.1/src/x86_x64/lzcnt.rs000066400000000000000000000012051445526200400170470ustar00rootroot00000000000000#![cfg(target_feature = "lzcnt")] use super::*; /// Count the leading zeroes in a `u32`. /// /// * **Intrinsic:** [`_lzcnt_u32`] /// * **Assembly:** `lzcnt r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "lzcnt")))] pub fn leading_zero_count_u32(a: u32) -> u32 { unsafe { _lzcnt_u32(a) } } /// Count the leading zeroes in a `u64`. /// /// * **Intrinsic:** [`_lzcnt_u64`] /// * **Assembly:** `lzcnt r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "lzcnt")))] pub fn leading_zero_count_u64(a: u64) -> u64 { unsafe { _lzcnt_u64(a) } } safe_arch-0.7.1/src/x86_x64/m128_.rs000066400000000000000000000126101445526200400165450ustar00rootroot00000000000000//! 
 This module is for the `m128` wrapper type, its bonus methods, and all //! necessary trait impls. //! //! Intrinsics should _not_ be in this module! They should all be free-functions //! in the other modules, sorted by CPU target feature. use super::*; /// The data for a 128-bit SSE register of four `f32` lanes. /// /// * This is _very similar to_ having `[f32; 4]`. The main difference is that /// it's aligned to 16 instead of just 4, and of course you can perform /// various intrinsic operations on it. #[repr(transparent)] #[allow(non_camel_case_types)] pub struct m128(pub __m128); #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Zeroable for m128 {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Pod for m128 {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::TransparentWrapper<__m128> for m128 {} impl m128 { /// Transmutes the `m128` to an array. /// /// Same as `m.into()`, just lets you be more explicit about what's happening. #[must_use] #[inline(always)] pub fn to_array(self) -> [f32; 4] { self.into() } /// Transmutes an array into `m128`. /// /// Same as `m128::from(arr)`, it just lets you be more explicit about what's /// happening. #[must_use] #[inline(always)] pub fn from_array(f: [f32; 4]) -> Self { f.into() } // /// Converts into the bit patterns of these floats (`[u32;4]`). /// /// Like [`f32::to_bits`](f32::to_bits), but all four lanes at once. #[must_use] #[inline(always)] pub fn to_bits(self) -> [u32; 4] { unsafe { core::mem::transmute(self) } } /// Converts from the bit patterns of these floats (`[u32;4]`). /// /// Like [`f32::from_bits`](f32::from_bits), but all four lanes at once. #[must_use] #[inline(always)] pub fn from_bits(bits: [u32; 4]) -> Self { unsafe { core::mem::transmute(bits) } } } impl Clone for m128 { #[must_use] #[inline(always)] fn clone(&self) -> Self { *self } } impl Copy for m128 {} impl Default for m128 { #[must_use] #[inline(always)] fn default() -> Self { unsafe { core::mem::zeroed() } } } impl From<[f32; 4]> for m128 { #[must_use] #[inline(always)] fn from(arr: [f32; 4]) -> Self { // Safety: because this semantically moves the value from the input position // (align4) to the output position (align16) it is fine to increase our // required alignment without worry. unsafe { core::mem::transmute(arr) } } } impl From<m128> for [f32; 4] { #[must_use] #[inline(always)] fn from(m: m128) -> Self { // We can of course transmute to a lower alignment unsafe { core::mem::transmute(m) } } } // // PLEASE KEEP ALL THE FORMAT IMPL JUNK AT THE END OF THE FILE // impl Debug for m128 { /// Debug formats each float. fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "m128(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Debug::fmt(float, f)?; } write!(f, ")") } } impl Display for m128 { /// Display formats each float, and leaves the type name off of the front. fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Display::fmt(float, f)?; } write!(f, ")") } } impl Binary for m128 { /// Binary formats each float's bit pattern (via [`f32::to_bits`]). fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Binary::fmt(&float.to_bits(), f)?; } write!(f, ")") } } impl LowerExp for m128 { /// LowerExp formats each float.
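/// For example (every lane of the default value is zero, mirroring the doctests on the other wrapper types): /// ``` /// # use safe_arch::*; /// let f = format!("{:e}", m128::default()); /// assert_eq!(&f, "(0e0, 0e0, 0e0, 0e0)"); /// ```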
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerExp::fmt(float, f)?; } write!(f, ")") } } impl UpperExp for m128 { /// UpperExp formats each float. fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperExp::fmt(float, f)?; } write!(f, ")") } } impl LowerHex for m128 { /// LowerHex formats each float's bit pattern (via [`f32::to_bits`]). fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerHex::fmt(&float.to_bits(), f)?; } write!(f, ")") } } impl UpperHex for m128 { /// UpperHex formats each float's bit pattern (via [`f32::to_bits`]). fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperHex::fmt(&float.to_bits(), f)?; } write!(f, ")") } } impl Octal for m128 { /// Octal formats each float's bit pattern (via [`f32::to_bits`]). fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Octal::fmt(&float.to_bits(), f)?; } write!(f, ")") } } safe_arch-0.7.1/src/x86_x64/m128d_.rs000066400000000000000000000142261445526200400167160ustar00rootroot00000000000000//! This module is for the `m128d` wrapper type, its bonus methods, and all //! necessary trait impls. //! //! Intrinsics should _not_ be in this module! They should all be free-functions //! in the other modules, sorted by CPU target feature. use super::*; /// The data for a 128-bit SSE register of two `f64` values. /// /// * This is _very similar to_ having `[f64; 2]`. The main difference is that /// it's aligned to 16 instead of just 4, and of course you can perform /// various intrinsic operations on it. #[repr(transparent)] #[allow(non_camel_case_types)] pub struct m128d(pub __m128d); #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Zeroable for m128d {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Pod for m128d {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::TransparentWrapper<__m128d> for m128d {} impl m128d { /// Transmutes the `m128d` to an array. /// /// Same as `m.into()`, just lets you be more explicit about what's happening. #[must_use] #[inline(always)] pub fn to_array(self) -> [f64; 2] { self.into() } /// Transmutes an array into `m128d`. /// /// Same as `m128d::from(arr)`, it just lets you be more explicit about what's /// happening. #[must_use] #[inline(always)] pub fn from_array(f: [f64; 2]) -> Self { f.into() } // /// Converts into the bit patterns of these doubles (`[u64;2]`). /// /// Like [`f64::to_bits`](f64::to_bits), but both lanes at once. #[must_use] #[inline(always)] pub fn to_bits(self) -> [u64; 2] { unsafe { core::mem::transmute(self) } } /// Converts from the bit patterns of these doubles (`[u64;2]`). /// /// Like [`f64::from_bits`](f64::from_bits), but both lanes at once. 
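/// A small round-trip sketch using only this module's own helpers (`from_array`, `to_bits`, `to_array`): converting to bits and back preserves the exact lane values. /// ``` /// # use safe_arch::*; /// let m = m128d::from_array([1.0, -2.5]); /// let round_trip = m128d::from_bits(m.to_bits()); /// assert_eq!(round_trip.to_array(), [1.0, -2.5]); /// ```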
#[must_use] #[inline(always)] pub fn from_bits(bits: [u64; 2]) -> Self { unsafe { core::mem::transmute(bits) } } } impl Clone for m128d { #[must_use] #[inline(always)] fn clone(&self) -> Self { *self } } impl Copy for m128d {} impl Default for m128d { #[must_use] #[inline(always)] fn default() -> Self { unsafe { core::mem::zeroed() } } } impl From<[f64; 2]> for m128d { #[must_use] #[inline(always)] fn from(arr: [f64; 2]) -> Self { // Safety: because this semantically moves the value from the input position // (align8) to the output position (align16) it is fine to increase our // required alignment without worry. unsafe { core::mem::transmute(arr) } } } impl From for [f64; 2] { #[must_use] #[inline(always)] fn from(m: m128d) -> Self { // We can of course transmute to a lower alignment unsafe { core::mem::transmute(m) } } } // // PLEASE KEEP ALL THE FORMAT IMPL JUNK AT THE END OF THE FILE // impl Debug for m128d { /// Debug formats each double. /// ``` /// # use safe_arch::*; /// let f = format!("{:?}", m128d::default()); /// assert_eq!(&f, "m128d(0.0, 0.0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let a = self.to_array(); write!(f, "m128d(")?; Debug::fmt(&a[0], f)?; write!(f, ", ")?; Debug::fmt(&a[1], f)?; write!(f, ")") } } impl Display for m128d { /// Display formats each double, and leaves the type name off of the font. /// ``` /// # use safe_arch::*; /// let f = format!("{}", m128d::default()); /// assert_eq!(&f, "(0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let a = self.to_array(); write!(f, "(")?; Display::fmt(&a[0], f)?; write!(f, ", ")?; Display::fmt(&a[1], f)?; write!(f, ")") } } impl Binary for m128d { /// Binary formats each double's bit pattern (via [`f64::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:b}", m128d::default()); /// assert_eq!(&f, "(0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let a = self.to_array(); write!(f, "(")?; Binary::fmt(&a[0].to_bits(), f)?; write!(f, ", ")?; Binary::fmt(&a[1].to_bits(), f)?; write!(f, ")") } } impl LowerExp for m128d { /// LowerExp formats each double. /// ``` /// # use safe_arch::*; /// let f = format!("{:e}", m128d::default()); /// assert_eq!(&f, "(0e0, 0e0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let a = self.to_array(); write!(f, "(")?; LowerExp::fmt(&a[0], f)?; write!(f, ", ")?; LowerExp::fmt(&a[1], f)?; write!(f, ")") } } impl UpperExp for m128d { /// UpperExp formats each double. /// ``` /// # use safe_arch::*; /// let f = format!("{:E}", m128d::default()); /// assert_eq!(&f, "(0E0, 0E0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let a = self.to_array(); write!(f, "(")?; UpperExp::fmt(&a[0], f)?; write!(f, ", ")?; UpperExp::fmt(&a[1], f)?; write!(f, ")") } } impl LowerHex for m128d { /// LowerHex formats each double's bit pattern (via [`f64::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:x}", m128d::default()); /// assert_eq!(&f, "(0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let a = self.to_array(); write!(f, "(")?; LowerHex::fmt(&a[0].to_bits(), f)?; write!(f, ", ")?; LowerHex::fmt(&a[1].to_bits(), f)?; write!(f, ")") } } impl UpperHex for m128d { /// UpperHex formats each double's bit pattern (via [`f64::to_bits`]). 
/// ``` /// # use safe_arch::*; /// let f = format!("{:X}", m128d::default()); /// assert_eq!(&f, "(0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let a = self.to_array(); write!(f, "(")?; UpperHex::fmt(&a[0].to_bits(), f)?; write!(f, ", ")?; UpperHex::fmt(&a[1].to_bits(), f)?; write!(f, ")") } } impl Octal for m128d { /// Octal formats each double's bit pattern (via [`f64::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:o}", m128d::default()); /// assert_eq!(&f, "(0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let a = self.to_array(); write!(f, "(")?; Octal::fmt(&a[0].to_bits(), f)?; write!(f, ", ")?; Octal::fmt(&a[1].to_bits(), f)?; write!(f, ")") } } safe_arch-0.7.1/src/x86_x64/m128i_.rs000066400000000000000000000174671445526200400167230ustar00rootroot00000000000000//! This module is for the `m128i` wrapper type, its bonus methods, and all //! necessary trait impls. //! //! Intrinsics should _not_ be in this module! They should all be free-functions //! in the other modules, sorted by CPU target feature. use super::*; /// The data for a 128-bit SSE register of integer data. /// /// * The exact layout to view the type as depends on the operation used. /// * `From` and `Into` impls are provided for all the relevant signed integer /// array types. /// * Formatting impls print as four `i32` values just because they have to pick /// something. If you want an alternative you can turn it into an array and /// print as you like. #[repr(transparent)] #[allow(non_camel_case_types)] pub struct m128i(pub __m128i); #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Zeroable for m128i {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Pod for m128i {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::TransparentWrapper<__m128i> for m128i {} impl Clone for m128i { #[must_use] #[inline(always)] fn clone(&self) -> Self { *self } } impl Copy for m128i {} impl Default for m128i { #[must_use] #[inline(always)] fn default() -> Self { unsafe { core::mem::zeroed() } } } // 8-bit impl From<[i8; 16]> for m128i { #[must_use] #[inline(always)] fn from(arr: [i8; 16]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From<m128i> for [i8; 16] { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } impl From<[u8; 16]> for m128i { #[must_use] #[inline(always)] fn from(arr: [u8; 16]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From<m128i> for [u8; 16] { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } // 16-bit impl From<[i16; 8]> for m128i { #[must_use] #[inline(always)] fn from(arr: [i16; 8]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From<m128i> for [i16; 8] { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } impl From<[u16; 8]> for m128i { #[must_use] #[inline(always)] fn from(arr: [u16; 8]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From<m128i> for [u16; 8] { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } // 32-bit impl From<[i32; 4]> for m128i { #[must_use] #[inline(always)] fn from(arr: [i32; 4]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From<m128i> for [i32; 4] { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } impl From<[u32; 4]> for m128i { #[must_use] #[inline(always)] fn from(arr: [u32; 4]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From<m128i> for 
[u32; 4] { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } // 64-bit impl From<[i64; 2]> for m128i { #[must_use] #[inline(always)] fn from(arr: [i64; 2]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [i64; 2] { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } impl From<[u64; 2]> for m128i { #[must_use] #[inline(always)] fn from(arr: [u64; 2]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [u64; 2] { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } // 128-bit impl From for m128i { #[must_use] #[inline(always)] fn from(i: i128) -> Self { unsafe { core::mem::transmute(i) } } } impl From for i128 { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } impl From for m128i { #[must_use] #[inline(always)] fn from(u: u128) -> Self { unsafe { core::mem::transmute(u) } } } impl From for u128 { #[must_use] #[inline(always)] fn from(m: m128i) -> Self { unsafe { core::mem::transmute(m) } } } // // PLEASE KEEP ALL THE FORMAT IMPL JUNK AT THE END OF THE FILE // impl Debug for m128i { /// Debug formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:?}", m128i::default()); /// assert_eq!(&f, "m128i(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "m128i(")?; for (i, int) in <[i32; 4]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } Debug::fmt(int, f)?; } write!(f, ")") } } impl Display for m128i { /// Display formats each `i32`, and leaves the type name off of the font. /// ``` /// # use safe_arch::*; /// let f = format!("{}", m128i::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 4]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } Display::fmt(int, f)?; } write!(f, ")") } } impl Binary for m128i { /// Binary formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:b}", m128i::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 4]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } Binary::fmt(int, f)?; } write!(f, ")") } } impl LowerExp for m128i { /// LowerExp formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:e}", m128i::default()); /// assert_eq!(&f, "(0e0, 0e0, 0e0, 0e0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 4]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerExp::fmt(int, f)?; } write!(f, ")") } } impl UpperExp for m128i { /// UpperExp formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:E}", m128i::default()); /// assert_eq!(&f, "(0E0, 0E0, 0E0, 0E0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 4]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperExp::fmt(int, f)?; } write!(f, ")") } } impl LowerHex for m128i { /// LowerHex formats each `i32`. 
/// ``` /// # use safe_arch::*; /// let f = format!("{:x}", m128i::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 4]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerHex::fmt(int, f)?; } write!(f, ")") } } impl UpperHex for m128i { /// UpperHex formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:X}", m128i::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 4]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperHex::fmt(int, f)?; } write!(f, ")") } } impl Octal for m128i { /// Octal formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:o}", m128i::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 4]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } Octal::fmt(int, f)?; } write!(f, ")") } } safe_arch-0.7.1/src/x86_x64/m256_.rs000066400000000000000000000150671445526200400165600ustar00rootroot00000000000000//! This module is for the `m256` wrapper type, its bonus methods, and all //! necessary trait impls. //! //! Intrinsics should _not_ be in this module! They should all be free-functions //! in the other modules, sorted by CPU target feature. use super::*; /// The data for a 256-bit AVX register of eight `f32` lanes. /// /// * This is _very similar to_ having `[f32; 8]`. The main difference is that /// it's aligned to 32 instead of just 4, and of course you can perform /// various intrinsic operations on it. #[repr(transparent)] #[allow(non_camel_case_types)] pub struct m256(pub __m256); #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Zeroable for m256 {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Pod for m256 {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::TransparentWrapper<__m256> for m256 {} impl m256 { /// Transmutes the `m256` to an array. /// /// Same as `m.into()`, just lets you be more explicit about what's happening. #[must_use] #[inline(always)] pub fn to_array(self) -> [f32; 8] { self.into() } /// Transmutes an array into `m256`. /// /// Same as `m256::from(arr)`, it just lets you be more explicit about what's /// happening. #[must_use] #[inline(always)] pub fn from_array(f: [f32; 8]) -> Self { f.into() } /// Converts into the bit patterns of these floats (`[u32;8]`). /// /// Like [`f32::to_bits`](f32::to_bits), but all eight lanes at once. #[must_use] #[inline(always)] pub fn to_bits(self) -> [u32; 8] { unsafe { core::mem::transmute(self) } } /// Converts from the bit patterns of these floats (`[u32;8]`). /// /// Like [`f32::from_bits`](f32::from_bits), but all eight lanes at once. #[must_use] #[inline(always)] pub fn from_bits(bits: [u32; 8]) -> Self { unsafe { core::mem::transmute(bits) } } } impl Clone for m256 { #[must_use] #[inline(always)] fn clone(&self) -> Self { *self } } impl Copy for m256 {} impl Default for m256 { #[must_use] #[inline(always)] fn default() -> Self { unsafe { core::mem::zeroed() } } } impl From<[f32; 8]> for m256 { #[must_use] #[inline(always)] fn from(arr: [f32; 8]) -> Self { // Safety: because this semantically moves the value from the input position // (align4) to the output position (align16) it is fine to increase our // required alignment without worry. 
unsafe { core::mem::transmute(arr) } } } impl From for [f32; 8] { #[must_use] #[inline(always)] fn from(m: m256) -> Self { // We can of course transmute to a lower alignment unsafe { core::mem::transmute(m) } } } // // PLEASE KEEP ALL THE FORMAT IMPL JUNK AT THE END OF THE FILE // impl Debug for m256 { /// Debug formats each float. /// ``` /// # use safe_arch::*; /// let f = format!("{:?}", m256::default()); /// assert_eq!(&f, "m256(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "m256(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Debug::fmt(float, f)?; } write!(f, ")") } } impl Display for m256 { /// Display formats each float, and leaves the type name off of the font. /// ``` /// # use safe_arch::*; /// let f = format!("{}", m256::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Display::fmt(float, f)?; } write!(f, ")") } } impl Binary for m256 { /// Binary formats each float's bit pattern (via [`f32::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:b}", m256::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Binary::fmt(&float.to_bits(), f)?; } write!(f, ")") } } impl LowerExp for m256 { /// LowerExp formats each float. /// ``` /// # use safe_arch::*; /// let f = format!("{:e}", m256::default()); /// assert_eq!(&f, "(0e0, 0e0, 0e0, 0e0, 0e0, 0e0, 0e0, 0e0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerExp::fmt(float, f)?; } write!(f, ")") } } impl UpperExp for m256 { /// UpperExp formats each float. /// ``` /// # use safe_arch::*; /// let f = format!("{:E}", m256::default()); /// assert_eq!(&f, "(0E0, 0E0, 0E0, 0E0, 0E0, 0E0, 0E0, 0E0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperExp::fmt(float, f)?; } write!(f, ")") } } impl LowerHex for m256 { /// LowerHex formats each float's bit pattern (via [`f32::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:x}", m256::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerHex::fmt(&float.to_bits(), f)?; } write!(f, ")") } } impl UpperHex for m256 { /// UpperHex formats each float's bit pattern (via [`f32::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:X}", m256::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperHex::fmt(&float.to_bits(), f)?; } write!(f, ")") } } impl Octal for m256 { /// Octal formats each float's bit pattern (via [`f32::to_bits`]). 
/// ``` /// # use safe_arch::*; /// let f = format!("{:o}", m256::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, float) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Octal::fmt(&float.to_bits(), f)?; } write!(f, ")") } } safe_arch-0.7.1/src/x86_x64/m256d_.rs000066400000000000000000000142271445526200400167210ustar00rootroot00000000000000//! This module is for the `m256d` wrapper type, its bonus methods, and all //! necessary trait impls. //! //! Intrinsics should _not_ be in this module! They should all be free-functions //! in the other modules, sorted by CPU target feature. use super::*; /// The data for a 256-bit AVX register of four `f64` values. /// /// * This is _very similar to_ having `[f64; 4]`. The main difference is that /// it's aligned to 32 instead of just 4, and of course you can perform /// various intrinsic operations on it. #[repr(transparent)] #[allow(non_camel_case_types)] pub struct m256d(pub __m256d); #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Zeroable for m256d {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Pod for m256d {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::TransparentWrapper<__m256d> for m256d {} impl m256d { /// Transmutes the `m256d` to an array. /// /// Same as `m.into()`, just lets you be more explicit about what's happening. #[must_use] #[inline(always)] pub fn to_array(self) -> [f64; 4] { self.into() } /// Transmutes an array into `m256d`. /// /// Same as `m256d::from(arr)`, it just lets you be more explicit about what's /// happening. #[must_use] #[inline(always)] pub fn from_array(f: [f64; 4]) -> Self { f.into() } // /// Converts into the bit patterns of these doubles (`[u64;4]`). /// /// Like [`f64::to_bits`](f64::to_bits), but all four lanes at once. #[must_use] #[inline(always)] pub fn to_bits(self) -> [u64; 4] { unsafe { core::mem::transmute(self) } } /// Converts from the bit patterns of these doubles (`[u64;4]`). /// /// Like [`f64::from_bits`](f64::from_bits), but all four lanes at once. #[must_use] #[inline(always)] pub fn from_bits(bits: [u64; 4]) -> Self { unsafe { core::mem::transmute(bits) } } } impl Clone for m256d { #[must_use] #[inline(always)] fn clone(&self) -> Self { *self } } impl Copy for m256d {} impl Default for m256d { #[must_use] #[inline(always)] fn default() -> Self { unsafe { core::mem::zeroed() } } } impl From<[f64; 4]> for m256d { #[must_use] #[inline(always)] fn from(arr: [f64; 4]) -> Self { // Safety: because this semantically moves the value from the input position // (align8) to the output position (align32) it is fine to increase our // required alignment without worry. unsafe { core::mem::transmute(arr) } } } impl From<m256d> for [f64; 4] { #[must_use] #[inline(always)] fn from(m: m256d) -> Self { // We can of course transmute to a lower alignment unsafe { core::mem::transmute(m) } } } // // PLEASE KEEP ALL THE FORMAT IMPL JUNK AT THE END OF THE FILE // impl Debug for m256d { /// Debug formats each double. /// ``` /// # use safe_arch::*; /// let f = format!("{:?}", m256d::default()); /// assert_eq!(&f, "m256d(0.0, 0.0, 0.0, 0.0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "m256d(")?; for (i, double) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Debug::fmt(double, f)?; } write!(f, ")") } } impl Display for m256d { /// Display formats each double, and leaves the type name off of the front.
/// ``` /// # use safe_arch::*; /// let f = format!("{}", m256d::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, double) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Display::fmt(double, f)?; } write!(f, ")") } } impl Binary for m256d { /// Binary formats each double's bit pattern (via [`f64::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:b}", m256d::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, double) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Binary::fmt(&double.to_bits(), f)?; } write!(f, ")") } } impl LowerExp for m256d { /// LowerExp formats each double. /// ``` /// # use safe_arch::*; /// let f = format!("{:e}", m256d::default()); /// assert_eq!(&f, "(0e0, 0e0, 0e0, 0e0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, double) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerExp::fmt(double, f)?; } write!(f, ")") } } impl UpperExp for m256d { /// UpperExp formats each double. /// ``` /// # use safe_arch::*; /// let f = format!("{:E}", m256d::default()); /// assert_eq!(&f, "(0E0, 0E0, 0E0, 0E0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, double) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperExp::fmt(double, f)?; } write!(f, ")") } } impl LowerHex for m256d { /// LowerHex formats each double's bit pattern (via [`f64::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:x}", m256d::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, double) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerHex::fmt(&double.to_bits(), f)?; } write!(f, ")") } } impl UpperHex for m256d { /// UpperHex formats each double's bit pattern (via [`f64::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:X}", m256d::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, double) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperHex::fmt(&double.to_bits(), f)?; } write!(f, ")") } } impl Octal for m256d { /// Octal formats each double's bit pattern (via [`f64::to_bits`]). /// ``` /// # use safe_arch::*; /// let f = format!("{:o}", m256d::default()); /// assert_eq!(&f, "(0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, double) in self.to_array().iter().enumerate() { if i != 0 { write!(f, ", ")?; } Octal::fmt(&double.to_bits(), f)?; } write!(f, ")") } } safe_arch-0.7.1/src/x86_x64/m256i_.rs000066400000000000000000000177131445526200400167310ustar00rootroot00000000000000//! This module is for the `m256i` wrapper type, its bonus methods, and all //! necessary trait impls. //! //! Intrinsics should _not_ be in this module! They should all be free-functions //! in the other modules, sorted by CPU target feature. use super::*; /// The data for a 256-bit AVX register of integer data. /// /// * The exact layout to view the type as depends on the operation used. /// * `From` and `Into` impls are provided for all the relevant signed integer /// array types. /// * Formatting impls print as eight `i32` values just because they have to pick /// something.
If you want an alternative you can turn it into an array and /// print as you like. #[repr(transparent)] #[allow(non_camel_case_types)] pub struct m256i(pub __m256i); #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Zeroable for m256i {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::Pod for m256i {} #[cfg(feature = "bytemuck")] unsafe impl bytemuck::TransparentWrapper<__m256i> for m256i {} impl Clone for m256i { #[must_use] #[inline(always)] fn clone(&self) -> Self { *self } } impl Copy for m256i {} impl Default for m256i { #[must_use] #[inline(always)] fn default() -> Self { unsafe { core::mem::zeroed() } } } // 8-bit impl From<[i8; 32]> for m256i { #[must_use] #[inline(always)] fn from(arr: [i8; 32]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [i8; 32] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } impl From<[u8; 32]> for m256i { #[must_use] #[inline(always)] fn from(arr: [u8; 32]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [u8; 32] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } // 16-bit impl From<[i16; 16]> for m256i { #[must_use] #[inline(always)] fn from(arr: [i16; 16]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [i16; 16] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } impl From<[u16; 16]> for m256i { #[must_use] #[inline(always)] fn from(arr: [u16; 16]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [u16; 16] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } // 32-bit impl From<[i32; 8]> for m256i { #[must_use] #[inline(always)] fn from(arr: [i32; 8]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [i32; 8] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } impl From<[u32; 8]> for m256i { #[must_use] #[inline(always)] fn from(arr: [u32; 8]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [u32; 8] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } // 64-bit impl From<[i64; 4]> for m256i { #[must_use] #[inline(always)] fn from(arr: [i64; 4]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [i64; 4] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } impl From<[u64; 4]> for m256i { #[must_use] #[inline(always)] fn from(arr: [u64; 4]) -> Self { unsafe { core::mem::transmute(arr) } } } impl From for [u64; 4] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } // 256-bit impl From<[i128; 2]> for m256i { #[must_use] #[inline(always)] fn from(i: [i128; 2]) -> Self { unsafe { core::mem::transmute(i) } } } impl From for [i128; 2] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } impl From<[u128; 2]> for m256i { #[must_use] #[inline(always)] fn from(u: [u128; 2]) -> Self { unsafe { core::mem::transmute(u) } } } impl From for [u128; 2] { #[must_use] #[inline(always)] fn from(m: m256i) -> Self { unsafe { core::mem::transmute(m) } } } // // PLEASE KEEP ALL THE FORMAT IMPL JUNK AT THE END OF THE FILE // impl Debug for m256i { /// Debug formats each `i32`. 
/// ``` /// # use safe_arch::*; /// let f = format!("{:?}", m256i::default()); /// assert_eq!(&f, "m256i(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "m256i(")?; for (i, int) in <[i32; 8]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } Debug::fmt(int, f)?; } write!(f, ")") } } impl Display for m256i { /// Display formats each `i32`, and leaves the type name off of the font. /// ``` /// # use safe_arch::*; /// let f = format!("{}", m256i::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 8]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } Display::fmt(int, f)?; } write!(f, ")") } } impl Binary for m256i { /// Binary formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:b}", m256i::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 8]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } Binary::fmt(int, f)?; } write!(f, ")") } } impl LowerExp for m256i { /// LowerExp formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:e}", m256i::default()); /// assert_eq!(&f, "(0e0, 0e0, 0e0, 0e0, 0e0, 0e0, 0e0, 0e0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 8]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerExp::fmt(int, f)?; } write!(f, ")") } } impl UpperExp for m256i { /// UpperExp formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:E}", m256i::default()); /// assert_eq!(&f, "(0E0, 0E0, 0E0, 0E0, 0E0, 0E0, 0E0, 0E0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 8]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperExp::fmt(int, f)?; } write!(f, ")") } } impl LowerHex for m256i { /// LowerHex formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:x}", m256i::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 8]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } LowerHex::fmt(int, f)?; } write!(f, ")") } } impl UpperHex for m256i { /// UpperHex formats each `i32`. /// ``` /// # use safe_arch::*; /// let f = format!("{:X}", m256i::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 8]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } UpperHex::fmt(int, f)?; } write!(f, ")") } } impl Octal for m256i { /// Octal formats each `i32`. 
/// ``` /// # use safe_arch::*; /// let f = format!("{:o}", m256i::default()); /// assert_eq!(&f, "(0, 0, 0, 0, 0, 0, 0, 0)"); /// ``` fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "(")?; for (i, int) in <[i32; 8]>::from(*self).iter().enumerate() { if i != 0 { write!(f, ", ")?; } Octal::fmt(int, f)?; } write!(f, ")") } } safe_arch-0.7.1/src/x86_x64/pclmulqdq.rs000066400000000000000000000011351445526200400177210ustar00rootroot00000000000000#![cfg(target_feature = "pclmulqdq")] use super::*; /// Performs a "carryless" multiplication of two `i64` values. /// /// The `IMM` value selects which lanes of `a` and `b` are multiplied. /// * Bit 0: the `i64` index from `a` to multiply. /// * Bit 4: the `i64` index from `b` to multiply. /// /// The output is the full carryless product: it can be up to 127 bits wide, so it spans both `i64` lanes of the result. /// /// * **Intrinsic:** [`_mm_clmulepi64_si128`] /// * **Assembly:** `pclmulqdq xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "pclmulqdq")))] pub fn mul_i64_carryless_m128i<const IMM: i32>(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_clmulepi64_si128(a.0, b.0, IMM) }) } safe_arch-0.7.1/src/x86_x64/popcnt.rs000066400000000000000000000012221445526200400172190ustar00rootroot00000000000000#![cfg(target_feature = "popcnt")] use super::*; /// Count the number of bits set within an `i32` /// /// * **Intrinsic:** [`_popcnt32`] /// * **Assembly:** `popcnt r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "popcnt")))] pub fn population_count_i32(a: i32) -> i32 { unsafe { _popcnt32(a) } } /// Count the number of bits set within an `i64` /// /// * **Intrinsic:** [`_popcnt64`] /// * **Assembly:** `popcnt r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "popcnt")))] pub fn population_count_i64(a: i64) -> i32 { unsafe { _popcnt64(a) } } safe_arch-0.7.1/src/x86_x64/rdrand.rs000066400000000000000000000017231445526200400171740ustar00rootroot00000000000000#![cfg(target_feature = "rdrand")] use super::*; /// Try to obtain a random `u16` from the hardware RNG. /// /// * **Intrinsic:** [`_rdrand16_step`] /// * **Assembly:** `rdrand r16` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "rdrand")))] pub fn rdrand_u16(out: &mut u16) -> i32 { unsafe { _rdrand16_step(out) } } /// Try to obtain a random `u32` from the hardware RNG. /// /// * **Intrinsic:** [`_rdrand32_step`] /// * **Assembly:** `rdrand r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "rdrand")))] pub fn rdrand_u32(out: &mut u32) -> i32 { unsafe { _rdrand32_step(out) } } /// Try to obtain a random `u64` from the hardware RNG. /// /// * **Intrinsic:** [`_rdrand64_step`] /// * **Assembly:** `rdrand r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "rdrand")))] pub fn rdrand_u64(out: &mut u64) -> i32 { unsafe { _rdrand64_step(out) } } safe_arch-0.7.1/src/x86_x64/rdseed.rs000066400000000000000000000017231445526200400171700ustar00rootroot00000000000000#![cfg(target_feature = "rdseed")] use super::*; /// Try to obtain a random `u16` from the hardware RNG. /// /// * **Intrinsic:** [`_rdseed16_step`] /// * **Assembly:** `rdseed r16` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "rdseed")))] pub fn rdseed_u16(out: &mut u16) -> i32 { unsafe { _rdseed16_step(out) } } /// Try to obtain a random `u32` from the hardware RNG.
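/// The intrinsic reports `1` when a random value was written to `out` and `0` when the hardware was not ready, so callers typically retry on `0`. A minimal usage sketch (illustrative only): /// ``` /// # use safe_arch::*; /// let mut seed = 0_u32; /// if rdseed_u32(&mut seed) == 1 { /// // `seed` now holds a hardware-seeded random value. /// } /// ```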
/// /// * **Intrinsic:** [`_rdseed32_step`] /// * **Assembly:** `rdseed r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "rdseed")))] pub fn rdseed_u32(out: &mut u32) -> i32 { unsafe { _rdseed32_step(out) } } /// Try to obtain a random `u64` from the hardware RNG. /// /// * **Intrinsic:** [`_rdseed64_step`] /// * **Assembly:** `rdseed r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "rdseed")))] pub fn rdseed_u64(out: &mut u64) -> i32 { unsafe { _rdseed64_step(out) } } safe_arch-0.7.1/src/x86_x64/sse.rs000066400000000000000000001241161445526200400165160ustar00rootroot00000000000000#![cfg(target_feature = "sse")] use super::*; /// Fetches the cache line containing `addr` into all levels of the cache hierarchy. #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn prefetch_t0<T>(addr: &T) { unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_T0) } } /// Fetches into L2 and higher. #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn prefetch_t1<T>(addr: &T) { unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_T1) } } /// Fetches into L3 and higher or an implementation-specific choice (e.g., L2 if there is no L3). #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn prefetch_t2<T>(addr: &T) { unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_T2) } } /// Fetch data using the /// non-temporal access (NTA) hint. It may be a place closer than main memory /// but outside of the cache hierarchy. This is used to reduce access latency /// without polluting the cache. #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn prefetch_nta<T>(addr: &T) { unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_NTA) } } /// Fetches the cache line containing `addr` into all levels of the cache hierarchy, anticipating write #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn prefetch_et0<T>(addr: &T) { unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_ET0) } } /// Fetches into L2 and higher, anticipating write #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn prefetch_et1<T>(addr: &T) { unsafe { _mm_prefetch(addr as *const T as *const i8, _MM_HINT_ET1) } } /// Lanewise `a + b`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]); /// let c = add_m128(a, b).to_array(); /// assert_eq!(c, [6.0, 8.0, 10.0, 12.5]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn add_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_add_ps(a.0, b.0) }) } /// Low lane `a + b`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]); /// let c = add_m128_s(a, b).to_array(); /// assert_eq!(c, [6.0, 2.0, 3.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn add_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_add_ss(a.0, b.0) }) } /// Bitwise `a & b`.
/// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]); /// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]); /// let c = bitand_m128(a, b).to_array(); /// assert_eq!(c, [1.0, 0.0, 0.0, 0.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn bitand_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_and_ps(a.0, b.0) }) } /// Bitwise `(!a) & b`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]); /// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]); /// let c = bitandnot_m128(a, b).to_array(); /// assert_eq!(c, [0.0, 1.0, 0.0, 0.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn bitandnot_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_andnot_ps(a.0, b.0) }) } /// Lanewise `a == b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]); /// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]); /// let c = cmp_eq_mask_m128(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 0, 0, u32::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_eq_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpeq_ps(a.0, b.0) }) } /// Low lane `a == b`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]); /// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]); /// let c = cmp_eq_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 0, 1_f32.to_bits(), 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_eq_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpeq_ss(a.0, b.0) }) } /// Lanewise `a >= b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_ge_mask_m128(a, b).to_bits(); /// assert_eq!(c, [0, u32::MAX, u32::MAX, u32::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_ge_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpge_ps(a.0, b.0) }) } /// Low lane `a >= b`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_ge_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_ge_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpge_ss(a.0, b.0) }) } /// Lanewise `a > b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_gt_mask_m128(a, b).to_bits(); /// assert_eq!(c, [0, 0, u32::MAX, u32::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_gt_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpgt_ps(a.0, b.0) }) } /// Low lane `a > b`, other lanes unchanged. /// /// Mask output. 
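/// A lane that compares true has every bit set (it reads back as `u32::MAX` through `to_bits`), and a lane that compares false is all zeroes, so the result can feed directly into bitwise blending/selection.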
/// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.5, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_gt_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_gt_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpgt_ss(a.0, b.0) }) } /// Lanewise `a <= b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_le_mask_m128(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, u32::MAX, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_le_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmple_ps(a.0, b.0) }) } /// Low lane `a <= b`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_le_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_le_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmple_ss(a.0, b.0) }) } /// Lanewise `a < b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_lt_mask_m128(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 0, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_lt_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmplt_ps(a.0, b.0) }) } /// Low lane `a < b`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_lt_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_lt_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmplt_ss(a.0, b.0) }) } /// Lanewise `a != b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]); /// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]); /// let c = cmp_neq_mask_m128(a, b).to_bits(); /// assert_eq!(c, [0, u32::MAX, u32::MAX, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_neq_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpneq_ps(a.0, b.0) }) } /// Low lane `a != b`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]); /// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]); /// let c = cmp_neq_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [0, 0, 1_f32.to_bits(), 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_neq_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpneq_ss(a.0, b.0) }) } /// Lanewise `!(a >= b)`. /// /// Mask output. 
/// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_nge_mask_m128(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 0, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_nge_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpnge_ps(a.0, b.0) }) } /// Low lane `!(a >= b)`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_nge_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_nge_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpnge_ss(a.0, b.0) }) } /// Lanewise `!(a > b)`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_ngt_mask_m128(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, u32::MAX, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_ngt_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpngt_ps(a.0, b.0) }) } /// Low lane `!(a > b)`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.5, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_ngt_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_ngt_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpngt_ss(a.0, b.0) }) } /// Lanewise `!(a <= b)`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_nle_mask_m128(a, b).to_bits(); /// assert_eq!(c, [0, 0, u32::MAX, u32::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_nle_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpnle_ps(a.0, b.0) }) } /// Low lane `!(a <= b)`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_nle_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_nle_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpnle_ss(a.0, b.0) }) } /// Lanewise `!(a < b)`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_nlt_mask_m128(a, b).to_bits(); /// assert_eq!(c, [0, u32::MAX, u32::MAX, u32::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_nlt_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpnlt_ps(a.0, b.0) }) } /// Low lane `!(a < b)`, other lanes unchanged. /// /// Mask output. 
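///
/// Note that `!(a < b)` is not the same as `a >= b` once NaN is involved: a
/// NaN in the low lane makes this comparison come out "true" (all 1 bits),
/// while [`cmp_ge_mask_m128_s`] would give 0. A small sketch of that:
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([f32::NAN, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// let c = cmp_nlt_mask_m128_s(a, b).to_bits();
/// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]);
/// ```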
/// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([2.0, 2.0, 2.0, 2.0]); /// let c = cmp_nlt_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_nlt_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpnlt_ss(a.0, b.0) }) } /// Lanewise `(!a.is_nan()) & (!b.is_nan())`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([0.0, f32::NAN, 0.0, f32::NAN]); /// let b = m128::from_array([0.0, 0.0, f32::NAN, f32::NAN]); /// let c = cmp_ordered_mask_m128(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 0, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_ordered_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpord_ps(a.0, b.0) }) } /// Low lane `(!a.is_nan()) & (!b.is_nan())`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([0.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([0.0, f32::NAN, f32::NAN, f32::NAN]); /// let c = cmp_ordered_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [u32::MAX, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_ordered_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpord_ss(a.0, b.0) }) } /// Lanewise `a.is_nan() | b.is_nan()`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([0.0, f32::NAN, 0.0, f32::NAN]); /// let b = m128::from_array([0.0, 0.0, f32::NAN, f32::NAN]); /// let c = cmp_unord_mask_m128(a, b).to_bits(); /// assert_eq!(c, [0, u32::MAX, u32::MAX, u32::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_unord_mask_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpunord_ps(a.0, b.0) }) } /// Low lane `a.is_nan() | b.is_nan()`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([0.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([0.0, f32::NAN, f32::NAN, f32::NAN]); /// let c = cmp_unord_mask_m128_s(a, b).to_bits(); /// assert_eq!(c, [0, 2_f32.to_bits(), 3_f32.to_bits(), 4_f32.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_unord_mask_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_cmpunord_ss(a.0, b.0) }) } /// Low lane equality. /// /// `i32` output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// assert_eq!(1_i32, cmp_eq_i32_m128_s(a, b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_eq_i32_m128_s(a: m128, b: m128) -> i32 { unsafe { _mm_comieq_ss(a.0, b.0) } } /// Low lane greater than or equal to. /// /// `i32` output. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]); /// assert_eq!(1_i32, cmp_ge_i32_m128_s(a, b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn cmp_ge_i32_m128_s(a: m128, b: m128) -> i32 { unsafe { _mm_comige_ss(a.0, b.0) } } /// Low lane greater than. /// /// `i32` output. 
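///
/// Only the low lane takes part in the comparison; the upper lanes of both
/// inputs are ignored. The result is `1` for true and `0` for false, so it can
/// feed directly into ordinary integer logic. A sketch:
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([3.0, 0.0, 0.0, 0.0]);
/// let b = m128::from_array([2.0, 9.0, 9.0, 9.0]);
/// assert_eq!(1_i32, cmp_gt_i32_m128_s(a, b));
/// ```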
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([2.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(1_i32, cmp_gt_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn cmp_gt_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comigt_ss(a.0, b.0) }
}

/// Low lane less than or equal to.
///
/// `i32` output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([0.5, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(1_i32, cmp_le_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn cmp_le_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comile_ss(a.0, b.0) }
}

/// Low lane less than.
///
/// `i32` output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([0.5, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(1_i32, cmp_lt_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn cmp_lt_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comilt_ss(a.0, b.0) }
}

/// Low lane not equal to.
///
/// `i32` output.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]);
/// assert_eq!(0_i32, cmp_neq_i32_m128_s(a, b));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn cmp_neq_i32_m128_s(a: m128, b: m128) -> i32 {
  unsafe { _mm_comineq_ss(a.0, b.0) }
}

/// Convert `i32` to `f32` and replace the low lane of the input.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = convert_i32_replace_m128_s(a, 5_i32).to_array();
/// assert_eq!(b, [5.0, 2.0, 3.0, 4.0]);
/// ```
/// * **Intrinsic:** [`_mm_cvtsi32_ss`]
/// * **Assembly:** `cvtsi2ss xmm, r32`
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn convert_i32_replace_m128_s(a: m128, i: i32) -> m128 {
  m128(unsafe { _mm_cvtsi32_ss(a.0, i) })
}

/// Convert `i64` to `f32` and replace the low lane of the input.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = convert_i64_replace_m128_s(a, 5_i64).to_array();
/// assert_eq!(b, [5.0, 2.0, 3.0, 4.0]);
/// ```
/// * **Intrinsic:** [`_mm_cvtsi64_ss`]
/// * **Assembly:** `cvtsi2ss xmm, r64`
#[must_use]
#[inline(always)]
#[cfg(target_arch = "x86_64")]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn convert_i64_replace_m128_s(a: m128, i: i64) -> m128 {
  m128(unsafe { _mm_cvtsi64_ss(a.0, i) })
}

/// Gets the low lane as an individual `f32` value.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// assert_eq!(1_f32, get_f32_from_m128_s(a));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn get_f32_from_m128_s(a: m128) -> f32 {
  unsafe { _mm_cvtss_f32(a.0) }
}

/// Converts the low lane to `i32` and extracts as an individual value.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// assert_eq!(1_i32, get_i32_from_m128_s(a));
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn get_i32_from_m128_s(a: m128) -> i32 {
  unsafe { _mm_cvtss_si32(a.0) }
}

/// Converts the low lane to `i64` and extracts as an individual value.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// assert_eq!(1_i64, get_i64_from_m128_s(a));
/// ```
#[must_use]
#[inline(always)]
#[cfg(target_arch = "x86_64")]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn get_i64_from_m128_s(a: m128) -> i64 {
  unsafe { _mm_cvttss_si64(a.0) }
}

/// Lanewise `a / b`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let b = m128::from_array([2.0, 6.0, 13.0, 2.0]);
/// let c = div_m128(a, b).to_array();
/// assert_eq!(c, [5.0, 2.0, 1.0, 7.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn div_m128(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_div_ps(a.0, b.0) })
}

/// Low lane `a / b`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let b = m128::from_array([2.0, 6.0, 13.0, 2.0]);
/// let c = div_m128_s(a, b).to_array();
/// assert_eq!(c, [5.0, 12.0, 13.0, 14.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn div_m128_s(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_div_ss(a.0, b.0) })
}

/// Loads the reference into a register.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let b = load_m128(&a);
/// assert_eq!(a.to_bits(), b.to_bits());
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn load_m128(a: &m128) -> m128 {
  m128(unsafe { _mm_load_ps(a as *const m128 as *const f32) })
}

/// Loads the `f32` reference into all lanes of a register.
/// ```
/// # use safe_arch::*;
/// let a = 1.0;
/// let b = load_f32_splat_m128(&a);
/// assert_eq!(m128::from_array([1.0, 1.0, 1.0, 1.0]).to_bits(), b.to_bits());
/// ```
#[must_use]
#[inline(always)]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn load_f32_splat_m128(a: &f32) -> m128 {
  // question: how is this different from _mm_broadcast_ss?
  m128(unsafe { _mm_load_ps1(a) })
}

/// Loads the `f32` reference into the low lane of the register.
/// ```
/// # use safe_arch::*;
/// let a = 1.0;
/// let b = load_f32_m128_s(&a);
/// assert_eq!(m128::from_array([1.0, 0.0, 0.0, 0.0]).to_bits(), b.to_bits());
/// ```
#[must_use]
#[inline(always)]
#[allow(clippy::trivially_copy_pass_by_ref)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn load_f32_m128_s(a: &f32) -> m128 {
  m128(unsafe { _mm_load_ss(a) })
}

/// Loads the reference into a register with reversed order.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let b = load_reverse_m128(&a);
/// assert_eq!(m128::from_array([14.0, 13.0, 12.0, 10.0]).to_bits(), b.to_bits());
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn load_reverse_m128(a: &m128) -> m128 {
  m128(unsafe { _mm_loadr_ps(a as *const m128 as *const f32) })
}

/// Loads the reference into a register.
///
/// This generally has no speed penalty if the reference happens to be 16-byte
/// aligned, but there is a slight speed penalty if the reference is only 4-byte
/// aligned.
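///
/// Because only 4-byte alignment is needed, this is a handy way to read a
/// register out of an arbitrary position in a larger `f32` slice. A sketch of
/// that (borrowing a `[f32; 4]` out of the slice with `TryFrom`):
/// ```
/// # use safe_arch::*;
/// # use core::convert::TryFrom;
/// let v = [10.0_f32, 12.0, 13.0, 14.0, 15.0, 16.0];
/// let chunk = <&[f32; 4]>::try_from(&v[1..5]).unwrap();
/// let b = load_unaligned_m128(chunk);
/// assert_eq!(b.to_array(), [12.0, 13.0, 14.0, 15.0]);
/// ```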
/// ``` /// # use safe_arch::*; /// let a = [10.0, 12.0, 13.0, 14.0]; /// let b = load_unaligned_m128(&a); /// assert_eq!(m128::from_array(a).to_bits(), b.to_bits()); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn load_unaligned_m128(a: &[f32; 4]) -> m128 { m128(unsafe { _mm_loadu_ps(a as *const [f32; 4] as *const f32) }) } /// Lanewise `max(a, b)`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]); /// let c = max_m128(a, b).to_array(); /// assert_eq!(c, [5.0, 12.0, 7.0, 8.5]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn max_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_max_ps(a.0, b.0) }) } /// Low lane `max(a, b)`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]); /// let c = max_m128_s(a, b).to_array(); /// assert_eq!(c, [5.0, 12.0, 3.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn max_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_max_ss(a.0, b.0) }) } /// Lanewise `min(a, b)`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]); /// let c = min_m128(a, b).to_array(); /// assert_eq!(c, [1.0, 6.0, 3.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn min_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_min_ps(a.0, b.0) }) } /// Low lane `min(a, b)`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]); /// let b = m128::from_array([0.0, 6.0, 7.0, 8.5]); /// let c = min_m128_s(a, b).to_array(); /// assert_eq!(c, [0.0, 12.0, 3.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn min_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_min_ss(a.0, b.0) }) } /// Move the low lane of `b` to `a`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]); /// let b = m128::from_array([8.0, 6.0, 7.0, 8.5]); /// let c = move_m128_s(a, b).to_array(); /// assert_eq!(c, [8.0, 12.0, 3.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn move_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_move_ss(a.0, b.0) }) } /// Move the high lanes of `b` to the low lanes of `a`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]); /// let b = m128::from_array([8.0, 6.0, 7.0, 8.5]); /// let c = move_high_low_m128(a, b).to_array(); /// assert_eq!(c, [7.0, 8.5, 3.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn move_high_low_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_movehl_ps(a.0, b.0) }) } /// Move the low lanes of `b` to the high lanes of `a`, other lanes unchanged. 
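///
/// Passing the same register as both arguments duplicates its low half into
/// its high half, which can be a handy building block:
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let dup_low = move_low_high_m128(a, a).to_array();
/// assert_eq!(dup_low, [1.0, 2.0, 1.0, 2.0]);
/// ```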
/// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 12.0, 3.0, 4.0]); /// let b = m128::from_array([8.0, 6.0, 7.0, 8.5]); /// let c = move_low_high_m128(a, b).to_array(); /// assert_eq!(c, [1.0, 12.0, 8.0, 6.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn move_low_high_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_movelh_ps(a.0, b.0) }) } /// Gathers the sign bit of each lane. /// /// The output has lane 0 as bit 0, lane 1 as bit 1, and so on. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([-1.0, 12.0, -3.0, -4.0]); /// let i = move_mask_m128(a); /// assert_eq!(i, 0b1101); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn move_mask_m128(a: m128) -> i32 { unsafe { _mm_movemask_ps(a.0) } } /// Lanewise `a * b`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]); /// let c = mul_m128(a, b).to_array(); /// assert_eq!(c, [5.0, 12.0, 21.0, 34.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn mul_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_mul_ps(a.0, b.0) }) } /// Low lane `a * b`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.5]); /// let c = mul_m128_s(a, b).to_array(); /// assert_eq!(c, [5.0, 2.0, 3.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn mul_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_mul_ss(a.0, b.0) }) } /// Bitwise `a | b`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]); /// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]); /// let c = bitor_m128(a, b).to_array(); /// assert_eq!(c, [1.0, 1.0, 1.0, 0.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn bitor_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_or_ps(a.0, b.0) }) } /// Lanewise `1.0 / a` approximation. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 4.0, 8.0]); /// let b = reciprocal_m128(a).to_array(); /// let expected = [1.0, 0.5, 0.25, 0.125]; /// for i in 0..4 { /// assert!((b[i] - expected[i]).abs() < 0.001); /// } /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn reciprocal_m128(a: m128) -> m128 { m128(unsafe { _mm_rcp_ps(a.0) }) } /// Low lane `1.0 / a` approximation, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 4.0, 8.0]); /// let b = reciprocal_m128_s(a).to_array(); /// let expected = [1.0, 2.0, 4.0, 8.0]; /// for i in 0..4 { /// assert!((b[i] - expected[i]).abs() < 0.001); /// } /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn reciprocal_m128_s(a: m128) -> m128 { m128(unsafe { _mm_rcp_ss(a.0) }) } /// Lanewise `1.0 / sqrt(a)` approximation. 
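///
/// The hardware approximation is only good to roughly 12 bits of relative
/// precision. When you need more, one Newton-Raphson step roughly doubles the
/// number of good bits; a sketch of that refinement using other functions from
/// this module:
/// ```
/// # use safe_arch::*;
/// let x = m128::from_array([16.0, 9.0, 4.0, 25.0]);
/// let y = reciprocal_sqrt_m128(x);
/// // y = y * (1.5 - 0.5 * x * y * y)
/// let half_x = mul_m128(x, set_splat_m128(0.5));
/// let y = mul_m128(y, sub_m128(set_splat_m128(1.5), mul_m128(half_x, mul_m128(y, y))));
/// let out = y.to_array();
/// let expected = [0.25, 1.0 / 3.0, 0.5, 0.2];
/// for i in 0..4 {
///   assert!((out[i] - expected[i]).abs() < 0.00001);
/// }
/// ```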
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([16.0, 9.0, 4.0, 25.0]);
/// let b = reciprocal_sqrt_m128(a).to_array();
/// let expected = [0.25, 0.33333, 0.5, 0.2];
/// for i in 0..4 {
///   assert!((b[i] - expected[i]).abs() < 0.001);
/// }
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn reciprocal_sqrt_m128(a: m128) -> m128 {
  m128(unsafe { _mm_rsqrt_ps(a.0) })
}

/// Low lane `1.0 / sqrt(a)` approximation, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([16.0, 8.0, 9.0, 10.0]);
/// let b = reciprocal_sqrt_m128_s(a).to_array();
/// let expected = [0.25, 8.0, 9.0, 10.0];
/// for i in 0..4 {
///   assert!((b[i] - expected[i]).abs() < 0.001);
/// }
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn reciprocal_sqrt_m128_s(a: m128) -> m128 {
  m128(unsafe { _mm_rsqrt_ss(a.0) })
}

/// Sets the args into an `m128`, first arg is the high lane.
/// ```
/// # use safe_arch::*;
/// let a = set_m128(1.0, 2.0, 3.0, 4.0).to_array();
/// let b = m128::from_array([4.0, 3.0, 2.0, 1.0]).to_array();
/// assert_eq!(a, b);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn set_m128(three: f32, two: f32, one: f32, zero: f32) -> m128 {
  m128(unsafe { _mm_set_ps(three, two, one, zero) })
}

/// Sets the given `f32` as the low lane of an `m128`, other lanes zero.
/// ```
/// # use safe_arch::*;
/// let a = set_m128_s(1.0).to_array();
/// let b = m128::from_array([1.0, 0.0, 0.0, 0.0]).to_array();
/// assert_eq!(a, b);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn set_m128_s(low: f32) -> m128 {
  m128(unsafe { _mm_set_ss(low) })
}

/// Splats the value to all lanes.
/// ```
/// # use safe_arch::*;
/// let a = set_splat_m128(1.0).to_array();
/// let b = m128::from_array([1.0, 1.0, 1.0, 1.0]).to_array();
/// assert_eq!(a, b);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn set_splat_m128(all: f32) -> m128 {
  m128(unsafe { _mm_set1_ps(all) })
}

/// Sets the args into an `m128`, first arg is the low lane.
/// ```
/// # use safe_arch::*;
/// let a = set_reversed_m128(1.0, 2.0, 3.0, 4.0).to_array();
/// let b = m128::from_array([1.0, 2.0, 3.0, 4.0]).to_array();
/// assert_eq!(a, b);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn set_reversed_m128(zero: f32, one: f32, two: f32, three: f32) -> m128 {
  m128(unsafe { _mm_setr_ps(zero, one, two, three) })
}

/// All lanes zero.
/// ```
/// # use safe_arch::*;
/// let a = zeroed_m128().to_array();
/// assert_eq!(a, [0.0, 0.0, 0.0, 0.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn zeroed_m128() -> m128 {
  m128(unsafe { _mm_setzero_ps() })
}

/// Shuffle the `f32` lanes from `a` and `b` together using an immediate
/// control value.
///
/// Each pair of bits in `MASK` selects one source lane: bits 0-1 and 2-3 pick
/// the low two output lanes from `a`, while bits 4-5 and 6-7 pick the high two
/// output lanes from `b`.
///
/// You can pass the same value as both arguments, but if you want to swizzle
/// within only a single register and you have `avx` available consider using
/// [`shuffle_ai_f32_all_m128`] instead. You'll get much better performance.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let b = m128::from_array([5.0, 6.0, 7.0, 8.0]);
/// //
/// let c = shuffle_abi_f32_all_m128::<0>(a, b).to_array();
/// assert_eq!(c, [1.0, 1.0, 5.0, 5.0]);
/// //
/// let c = shuffle_abi_f32_all_m128::<0b11_10_01_00>(a, b).to_array();
/// assert_eq!(c, [1.0, 2.0, 7.0, 8.0]);
/// //
/// let c = shuffle_abi_f32_all_m128::<0b00_10_10_01>(a, b).to_array();
/// assert_eq!(c, [2.0, 3.0, 7.0, 5.0]);
/// ```
/// * **Intrinsic:** [`_mm_shuffle_ps`]
/// * **Assembly:** `shufps xmm, xmm, imm8`
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn shuffle_abi_f32_all_m128<const MASK: i32>(a: m128, b: m128) -> m128 {
  m128(unsafe { _mm_shuffle_ps(a.0, b.0, MASK) })
}

/// Lanewise `sqrt(a)`.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([25.0, 16.0, 9.0, 4.0]);
/// let b = sqrt_m128(a).to_array();
/// assert_eq!(b, [5.0, 4.0, 3.0, 2.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn sqrt_m128(a: m128) -> m128 {
  m128(unsafe { _mm_sqrt_ps(a.0) })
}

/// Low lane `sqrt(a)`, other lanes unchanged.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([4.0, 8.0, 7.0, 6.0]);
/// let b = sqrt_m128_s(a).to_array();
/// assert_eq!(b, [2.0, 8.0, 7.0, 6.0]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn sqrt_m128_s(a: m128) -> m128 {
  m128(unsafe { _mm_sqrt_ss(a.0) })
}

/// Stores the value to the reference given.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let mut b = zeroed_m128();
/// store_m128(&mut b, a);
/// let c = b.to_array();
/// assert_eq!(c, [10.0, 12.0, 13.0, 14.0]);
/// ```
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn store_m128(r: &mut m128, a: m128) {
  unsafe { _mm_store_ps(r as *mut m128 as *mut f32, a.0) }
}

/// Stores the low lane value to the reference given.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let mut f = 0.0;
/// store_m128_s(&mut f, a);
/// assert_eq!(f, 10.0);
/// ```
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn store_m128_s(r: &mut f32, a: m128) {
  unsafe { _mm_store_ss(r as *mut f32, a.0) }
}

/// Stores the low lane value to all lanes of the reference given.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let mut b = zeroed_m128();
/// store_splat_m128(&mut b, a);
/// let c = b.to_array();
/// assert_eq!(c, [10.0, 10.0, 10.0, 10.0]);
/// ```
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn store_splat_m128(r: &mut m128, a: m128) {
  unsafe { _mm_store1_ps(r as *mut m128 as *mut f32, a.0) }
}

/// Stores the value to the reference given in reverse order.
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]);
/// let mut b = zeroed_m128();
/// store_reverse_m128(&mut b, a);
/// let c = b.to_array();
/// assert_eq!(c, [14.0, 13.0, 12.0, 10.0]);
/// ```
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))]
pub fn store_reverse_m128(r: &mut m128, a: m128) {
  unsafe { _mm_storer_ps(r as *mut m128 as *mut f32, a.0) }
}

/// Stores the value to the reference given.
///
/// This generally has no speed penalty if the reference happens to be 16-byte
/// aligned, but there is a slight speed penalty if the reference is only 4-byte
/// aligned.
/// ``` /// # use safe_arch::*; /// let a = m128::from_array([10.0, 12.0, 13.0, 14.0]); /// let mut b = [0.0; 4]; /// store_unaligned_m128(&mut b, a); /// assert_eq!(b, [10.0, 12.0, 13.0, 14.0]); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn store_unaligned_m128(r: &mut [f32; 4], a: m128) { unsafe { _mm_storeu_ps(r.as_mut_ptr(), a.0) } } /// Lanewise `a - b`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 8.0, 12.0, 3.0]); /// let c = sub_m128(a, b).to_array(); /// assert_eq!(c, [-4.0, -6.0, -9.0, 1.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn sub_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_sub_ps(a.0, b.0) }) } /// Low lane `a - b`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 8.0, 12.0, 3.0]); /// let c = sub_m128_s(a, b).to_array(); /// assert_eq!(c, [-4.0, 2.0, 3.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn sub_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_sub_ss(a.0, b.0) }) } /// Transpose four `m128` as if they were a 4x4 matrix. /// ``` /// # use safe_arch::*; /// let mut a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let mut b = m128::from_array([5.0, 6.0, 7.0, 8.0]); /// let mut c = m128::from_array([9.0, 10.0, 11.0, 12.0]); /// let mut d = m128::from_array([13.0, 14.0, 15.0, 16.0]); /// transpose_four_m128(&mut a, &mut b, &mut c, &mut d); /// assert_eq!(a.to_array(), [1.0, 5.0, 9.0, 13.0]); /// assert_eq!(b.to_array(), [2.0, 6.0, 10.0, 14.0]); /// assert_eq!(c.to_array(), [3.0, 7.0, 11.0, 15.0]); /// assert_eq!(d.to_array(), [4.0, 8.0, 12.0, 16.0]); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn transpose_four_m128(a: &mut m128, b: &mut m128, c: &mut m128, d: &mut m128) { unsafe { _MM_TRANSPOSE4_PS(&mut a.0, &mut b.0, &mut c.0, &mut d.0) } } /// Unpack and interleave high lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.0]); /// let c = unpack_high_m128(a, b).to_array(); /// assert_eq!(c, [3.0, 7.0, 4.0, 8.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn unpack_high_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_unpackhi_ps(a.0, b.0) }) } /// Unpack and interleave low lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.0]); /// let c = unpack_low_m128(a, b).to_array(); /// assert_eq!(c, [1.0, 5.0, 2.0, 6.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn unpack_low_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_unpacklo_ps(a.0, b.0) }) } /// Bitwise `a ^ b`. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 0.0, 1.0, 0.0]); /// let b = m128::from_array([1.0, 1.0, 0.0, 0.0]); /// let c = bitxor_m128(a, b).to_array(); /// assert_eq!(c, [0.0, 1.0, 1.0, 0.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse")))] pub fn bitxor_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_xor_ps(a.0, b.0) }) } // // Here we define the Operator Overloads for `m128`. Each one just calls the // correct function from above. 
By putting the impls here and not with the // `m128` type we theoretically would be able to build the crate safely even if // there's no `sse` feature enabled. You'd just have a `m128` type without the // operator overloads is all. Not that the standard Rust distribution can build // properly without `sse` enabled, but maybe you're using a custom target or // something. It doesn't really put us out of our way, so it doesn't hurt to try // and accommodate the potential use case. // impl Add for m128 { type Output = Self; #[must_use] #[inline(always)] fn add(self, rhs: Self) -> Self { add_m128(self, rhs) } } impl AddAssign for m128 { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = *self + rhs; } } impl BitAnd for m128 { type Output = Self; #[must_use] #[inline(always)] fn bitand(self, rhs: Self) -> Self { bitand_m128(self, rhs) } } impl BitAndAssign for m128 { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = *self & rhs; } } impl BitOr for m128 { type Output = Self; #[must_use] #[inline(always)] fn bitor(self, rhs: Self) -> Self { bitor_m128(self, rhs) } } impl BitOrAssign for m128 { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = *self | rhs; } } impl BitXor for m128 { type Output = Self; #[must_use] #[inline(always)] fn bitxor(self, rhs: Self) -> Self { bitxor_m128(self, rhs) } } impl BitXorAssign for m128 { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = *self ^ rhs; } } impl Div for m128 { type Output = Self; #[must_use] #[inline(always)] fn div(self, rhs: Self) -> Self { div_m128(self, rhs) } } impl DivAssign for m128 { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = *self / rhs; } } impl Mul for m128 { type Output = Self; #[must_use] #[inline(always)] fn mul(self, rhs: Self) -> Self { mul_m128(self, rhs) } } impl MulAssign for m128 { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = *self * rhs; } } impl Neg for m128 { type Output = Self; #[must_use] #[inline(always)] fn neg(self) -> Self { sub_m128(zeroed_m128(), self) } } impl Not for m128 { type Output = Self; /// Not a direct intrinsic, but it's very useful and the implementation is /// simple enough. /// /// Negates the bits by performing an `xor` with an all-1s bit pattern. #[must_use] #[inline(always)] fn not(self) -> Self { let all_bits = set_splat_m128(f32::from_bits(u32::MAX)); self ^ all_bits } } impl Sub for m128 { type Output = Self; #[must_use] #[inline(always)] fn sub(self, rhs: Self) -> Self { sub_m128(self, rhs) } } impl SubAssign for m128 { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = *self - rhs; } } impl PartialEq for m128 { /// Not a direct intrinsic, this is a `cmp_eq_mask` and then a `move_mask`. #[must_use] #[inline(always)] fn eq(&self, other: &Self) -> bool { move_mask_m128(cmp_eq_mask_m128(*self, *other)) == 0b1111 } } safe_arch-0.7.1/src/x86_x64/sse2.rs000066400000000000000000003012741445526200400166020ustar00rootroot00000000000000#![cfg(target_feature = "sse2")] use super::*; /// Lanewise `a + b` with lanes as `i8`. 
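///
/// Overflow wraps around, exactly like wrapping `i8` math lane by lane; use
/// [`add_saturating_i8_m128i`] if you want clamping instead. A sketch showing
/// every lane wrapping from `i8::MAX` to `i8::MIN`:
/// ```
/// # use safe_arch::*;
/// let a = m128i::from([i8::MAX; 16]);
/// let b = m128i::from([1_i8; 16]);
/// let c: [i8; 16] = add_i8_m128i(a, b).into();
/// assert_eq!(c, [i8::MIN; 16]);
/// ```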
/// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b = m128i::from([0_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, -20, 21, 22, -23, 24, 127]); /// let c: [i8; 16] = add_i8_m128i(a, b).into(); /// assert_eq!(c, [0, 12, 4, 16, 8, 20, 12, 24, 16, 28, -10, 32, 34, -10, 38, -114]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_add_epi8(a.0, b.0) }) } /// Lanewise `a + b` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([5_i16, 6, 7, 8, -15, -26, -37, 48]); /// let c: [i16; 8] = add_i16_m128i(a, b).into(); /// assert_eq!(c, [6, 8, 10, 12, -16, -28, -40, 44]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_add_epi16(a.0, b.0) }) } /// Lanewise `a + b` with lanes as `i32`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = m128i::from([5, 6, 7, 8]); /// let c: [i32; 4] = add_i32_m128i(a, b).into(); /// assert_eq!(c, [6, 8, 10, 12]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_add_epi32(a.0, b.0) }) } /// Lanewise `a + b` with lanes as `i64`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([92_i64, 87]); /// let b = m128i::from([-9001_i64, 1]); /// let c: [i64; 2] = add_i64_m128i(a, b).into(); /// assert_eq!(c, [-8909, 88]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_i64_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_add_epi64(a.0, b.0) }) } /// Lanewise `a + b`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 87.5]); /// let b = m128d::from_array([100.0, -6.0]); /// let c = add_m128d(a, b).to_array(); /// assert_eq!(c, [192.0, 81.5]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_add_pd(a.0, b.0) }) } /// Lowest lane `a + b`, high lane unchanged. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 87.5]); /// let b = m128d::from_array([100.0, -600.0]); /// let c = add_m128d_s(a, b).to_array(); /// assert_eq!(c, [192.0, 87.5]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_add_sd(a.0, b.0) }) } /// Lanewise saturating `a + b` with lanes as `i8`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([ /// i8::MAX, i8::MIN, 3, 4, -1, -2, -3, -4, /// 3, 4, -1, -2, -1, -2, -3, -4, /// ]); /// let b = m128i::from([ /// i8::MAX, i8::MIN, 7, 8, -15, -26, -37, 48, /// 7, 8, -15, -26, -15, -26, -37, 48, /// ]); /// let c: [i8; 16] = add_saturating_i8_m128i(a, b).into(); /// assert_eq!( /// c, /// [ /// i8::MAX, i8::MIN, 10, 12, -16, -28, -40, 44, /// 10, 12, -16, -28, -16, -28, -40, 44 /// ] /// ); /// ``` #[must_use] #[inline(always)] #[rustfmt::skip] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_saturating_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_adds_epi8(a.0, b.0) }) } /// Lanewise saturating `a + b` with lanes as `i16`. 
/// ``` /// # use safe_arch::*; /// let a = m128i::from([i16::MAX, i16::MIN, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([i16::MAX, i16::MIN, 7, 8, -15, -26, -37, 48]); /// let c: [i16; 8] = add_saturating_i16_m128i(a, b).into(); /// assert_eq!(c, [i16::MAX, i16::MIN, 10, 12, -16, -28, -40, 44]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_saturating_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_adds_epi16(a.0, b.0) }) } /// Lanewise saturating `a + b` with lanes as `u8`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([ /// u8::MAX, 0, 3, 4, 254, 2, 3, 4, /// 3, 4, 1, 2, 1, 2, 128, 4, /// ]); /// let b = m128i::from([ /// u8::MAX, 0, 7, 8, 15, 26, 37, 48, /// 7, 8, 15, 26, 15, 26, 37, 48, /// ]); /// let c: [u8; 16] = add_saturating_u8_m128i(a, b).into(); /// assert_eq!( /// c, /// [ /// u8::MAX, 0, 10, 12, 255, 28, 40, 52, /// 10, 12, 16, 28, 16, 28, 165, 52 /// ] /// ); /// ``` #[must_use] #[inline(always)] #[rustfmt::skip] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_saturating_u8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_adds_epu8(a.0, b.0) }) } /// Lanewise saturating `a + b` with lanes as `u16`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([u16::MAX, 0, 3, 4, 1, 2, 3, 4]); /// let b = m128i::from([u16::MAX, 0, 7, 8, 15, 26, 37, 48]); /// let c: [u16; 8] = add_saturating_u16_m128i(a, b).into(); /// assert_eq!(c, [u16::MAX, 0, 10, 12, 16, 28, 40, 52]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn add_saturating_u16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_adds_epu16(a.0, b.0) }) } /// Bitwise `a & b`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = bitand_m128d(a, b).to_array(); /// assert_eq!(c, [1.0, 0.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn bitand_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_and_pd(a.0, b.0) }) } /// Bitwise `a & b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 0, 1, 0]); /// let b = m128i::from([1, 1, 0, 0]); /// let c: [i32; 4] = bitand_m128i(a, b).into(); /// assert_eq!(c, [1, 0, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn bitand_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_and_si128(a.0, b.0) }) } /// Bitwise `(!a) & b`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = bitandnot_m128d(a, b).to_array(); /// assert_eq!(c, [0.0, 1.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn bitandnot_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_andnot_pd(a.0, b.0) }) } /// Bitwise `(!a) & b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 0, 1, 0]); /// let b = m128i::from([1, 1, 0, 0]); /// let c: [i32; 4] = bitandnot_m128i(a, b).into(); /// assert_eq!(c, [0, 1, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn bitandnot_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_andnot_si128(a.0, b.0) }) } /// Lanewise average of the `u8` values. 
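///
/// The average is computed in full precision as `(a + b + 1) >> 1`, so a half
/// rounds up. A sketch of that rounding:
/// ```
/// # use safe_arch::*;
/// let a = m128i::from([1_u8; 16]);
/// let b = m128i::from([2_u8; 16]);
/// let c: [u8; 16] = average_u8_m128i(a, b).into();
/// // (1 + 2 + 1) >> 1 == 2 in every lane
/// assert_eq!(c, [2_u8; 16]);
/// ```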
/// ```
/// # use safe_arch::*;
/// let a = m128i::from([
///   u8::MAX, 0, 3, 4, 254, 2, 3, 4,
///   3, 4, 1, 2, 1, 2, 128, 4,
/// ]);
/// let b = m128i::from([
///   u8::MAX, 0, 7, 8, 15, 26, 37, 48,
///   7, 8, 15, 26, 15, 26, 37, 48,
/// ]);
/// let c: [u8; 16] = average_u8_m128i(a, b).into();
/// assert_eq!(
///   c,
///   [
///     u8::MAX, 0, 5, 6, 135, 14, 20, 26,
///     5, 6, 8, 14, 8, 14, 83, 26
///   ]
/// );
/// ```
#[must_use]
#[inline(always)]
#[rustfmt::skip]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))]
pub fn average_u8_m128i(a: m128i, b: m128i) -> m128i {
  m128i(unsafe { _mm_avg_epu8(a.0, b.0) })
}

/// Lanewise average of the `u16` values.
/// ```
/// # use safe_arch::*;
/// let a = m128i::from([u16::MAX, 0, 3, 4, 1, 2, 3, 4]);
/// let b = m128i::from([u16::MAX, 0, 7, 8, 15, 26, 37, 48]);
/// let c: [u16; 8] = average_u16_m128i(a, b).into();
/// assert_eq!(c, [u16::MAX, 0, 5, 6, 8, 14, 20, 26]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))]
pub fn average_u16_m128i(a: m128i, b: m128i) -> m128i {
  m128i(unsafe { _mm_avg_epu16(a.0, b.0) })
}

/// Shifts all bits in the entire register left by a number of **bytes**.
///
/// ```
/// # use safe_arch::*;
/// let a = m128i::from(0x0000000B_0000000A_0000000F_11111111_u128);
/// //
/// let b: u128 = byte_shl_imm_u128_m128i::<1>(a).into();
/// assert_eq!(b, 0x00000B00_00000A00_00000F11_11111100);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))]
pub fn byte_shl_imm_u128_m128i<const IMM: i32>(a: m128i) -> m128i {
  m128i(unsafe { _mm_bslli_si128(a.0, IMM) })
}

/// Shifts all bits in the entire register right by a number of **bytes**.
///
/// ```
/// # use safe_arch::*;
/// let a = m128i::from(0x0000000B_0000000A_0000000F_11111111_u128);
/// //
/// let c: u128 = byte_shr_imm_u128_m128i::<1>(a).into();
/// assert_eq!(c, 0x00000000_0B000000_0A000000_0F111111);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))]
pub fn byte_shr_imm_u128_m128i<const IMM: i32>(a: m128i) -> m128i {
  m128i(unsafe { _mm_bsrli_si128(a.0, IMM) })
}

/// Bit-preserving cast to `m128` from `m128d`
/// ```
/// # use safe_arch::*;
/// let a = m128d::from_array([1.0, 2.0]);
/// let c: [u32; 4] = cast_to_m128_from_m128d(a).to_bits();
/// assert_eq!(c, [0, 0x3FF00000, 0, 0x40000000]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))]
pub fn cast_to_m128_from_m128d(a: m128d) -> m128 {
  m128(unsafe { _mm_castpd_ps(a.0) })
}

/// Bit-preserving cast to `m128i` from `m128d`
/// ```
/// # use safe_arch::*;
/// let a = m128d::from_array([1.0, 2.0]);
/// let c: [u32; 4] = cast_to_m128i_from_m128d(a).into();
/// assert_eq!(c, [0, 0x3FF00000, 0, 0x40000000]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))]
pub fn cast_to_m128i_from_m128d(a: m128d) -> m128i {
  m128i(unsafe { _mm_castpd_si128(a.0) })
}

/// Bit-preserving cast to `m128d` from `m128`
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let c: [u64; 2] = cast_to_m128d_from_m128(a).to_bits();
/// assert_eq!(c, [0x400000003F800000, 0x4080000040400000]);
/// ```
#[must_use]
#[inline(always)]
#[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))]
pub fn cast_to_m128d_from_m128(a: m128) -> m128d {
  m128d(unsafe { _mm_castps_pd(a.0) })
}

/// Bit-preserving cast to `m128i` from `m128`
/// ```
/// # use safe_arch::*;
/// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]);
/// let c: [u32; 4] =
cast_to_m128i_from_m128(a).into(); /// assert_eq!(c, [0x3F800000, 0x40000000, 0x40400000, 0x40800000]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cast_to_m128i_from_m128(a: m128) -> m128i { m128i(unsafe { _mm_castps_si128(a.0) }) } /// Bit-preserving cast to `m128d` from `m128i` /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let c: [u64; 2] = cast_to_m128d_from_m128i(a).to_bits(); /// assert_eq!(c, [0x200000001, 0x400000003]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cast_to_m128d_from_m128i(a: m128i) -> m128d { m128d(unsafe { _mm_castsi128_pd(a.0) }) } /// Bit-preserving cast to `m128` from `m128i` /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let c: [u32; 4] = cast_to_m128_from_m128i(a).to_bits(); /// assert_eq!(c, [1, 2, 3, 4]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cast_to_m128_from_m128i(a: m128i) -> m128 { m128(unsafe { _mm_castsi128_ps(a.0) }) } /// Lanewise `a == b` with lanes as `i8`. /// /// All bits 1 for true (`-1`), all bit 0 for false (`0`). /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]); /// let b = m128i::from([0_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, -20, 21, 22, -23, 24, 127]); /// let c: [i8; 16] = cmp_eq_mask_i8_m128i(a, b).into(); /// assert_eq!(c, [-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, -1]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_eq_mask_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmpeq_epi8(a.0, b.0) }) } /// Lanewise `a == b` with lanes as `i16`. /// /// All bits 1 for true (`-1`), all bit 0 for false (`0`). /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([5_i16, 2, 7, 4, -15, -26, -37, -4]); /// let c: [i16; 8] = cmp_eq_mask_i16_m128i(a, b).into(); /// assert_eq!(c, [0, -1, 0, -1, 0, 0, 0, -1]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_eq_mask_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmpeq_epi16(a.0, b.0) }) } /// Lanewise `a == b` with lanes as `i32`. /// /// All bits 1 for true (`-1`), all bit 0 for false (`0`). /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = m128i::from([5, 2, 7, 4]); /// let c: [i32; 4] = cmp_eq_mask_i32_m128i(a, b).into(); /// assert_eq!(c, [0, -1, 0, -1]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_eq_mask_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmpeq_epi32(a.0, b.0) }) } /// Lanewise `a == b`, mask output. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_eq_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_eq_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpeq_pd(a.0, b.0) }) } /// Low lane `a == b`, other lanes unchanged. /// /// Mask output. 
/// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_eq_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_eq_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpeq_sd(a.0, b.0) }) } /// Lanewise `a >= b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([3.0, 1.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_ge_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, u64::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_ge_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpge_pd(a.0, b.0) }) } /// Low lane `a >= b`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_ge_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_ge_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpge_sd(a.0, b.0) }) } /// Lanewise `a > b` with lanes as `i8`. /// /// All bits 1 for true (`-1`), all bit 0 for false (`0`). /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i8, 1, 20, 3, 40, 5, 60, 7, 80, 9, 10, 11, 12, 13, 14, 127]); /// let b = m128i::from([0_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, -20, 21, 22, -23, 24, 120]); /// let c: [i8; 16] = cmp_gt_mask_i8_m128i(a, b).into(); /// assert_eq!(c, [-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_gt_mask_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmpgt_epi8(a.0, b.0) }) } /// Lanewise `a > b` with lanes as `i16`. /// /// All bits 1 for true (`-1`), all bit 0 for false (`0`). /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 20, 3, 40, -1, -2, -3, 0]); /// let b = m128i::from([5_i16, 2, 7, 4, -15, -26, -37, -4]); /// let c: [i16; 8] = cmp_gt_mask_i16_m128i(a, b).into(); /// assert_eq!(c, [0, -1, 0, -1, -1, -1, -1, -1]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_gt_mask_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmpgt_epi16(a.0, b.0) }) } /// Lanewise `a > b` with lanes as `i32`. /// /// All bits 1 for true (`-1`), all bit 0 for false (`0`). /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 20, 7, 40]); /// let b = m128i::from([5, 2, 7, 4]); /// let c: [i32; 4] = cmp_gt_mask_i32_m128i(a, b).into(); /// assert_eq!(c, [0, -1, 0, -1]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_gt_mask_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmpgt_epi32(a.0, b.0) }) } /// Lanewise `a > b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_gt_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_gt_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpgt_pd(a.0, b.0) }) } /// Low lane `a > b`, other lanes unchanged. 
/// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_gt_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_gt_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpgt_sd(a.0, b.0) }) } /// Lanewise `a <= b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([0.0, 1.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_le_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, u64::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_le_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmple_pd(a.0, b.0) }) } /// Low lane `a <= b`, other lanes unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([0.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_le_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_le_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmple_sd(a.0, b.0) }) } /// Lanewise `a < b` with lanes as `i8`. /// /// All bits 1 for true (`-1`), all bit 0 for false (`0`). /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i8, 1, 20, 3, 40, 5, 60, 7, 80, 9, 10, 11, 12, 13, 14, 127]); /// let b = m128i::from([0_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, -20, 21, 22, -23, 24, 120]); /// let c: [i8; 16] = cmp_lt_mask_i8_m128i(a, b).into(); /// assert_eq!(c, [0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -1, 0, -1, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_lt_mask_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmplt_epi8(a.0, b.0) }) } /// Lanewise `a < b` with lanes as `i16`. /// /// All bits 1 for true (`-1`), all bit 0 for false (`0`). /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 20, 3, 40, -1, -2, -3, 0]); /// let b = m128i::from([5_i16, 2, 7, 4, -15, -26, -37, -4]); /// let c: [i16; 8] = cmp_lt_mask_i16_m128i(a, b).into(); /// assert_eq!(c, [-1, 0, -1, 0, 0, 0, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_lt_mask_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmplt_epi16(a.0, b.0) }) } /// Lanewise `a < b` with lanes as `i32`. /// /// All bits 1 for true (`-1`), all bit 0 for false (`0`). /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 20, 7, 40]); /// let b = m128i::from([5, 2, 7, 4]); /// let c: [i32; 4] = cmp_lt_mask_i32_m128i(a, b).into(); /// assert_eq!(c, [-1, 0, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_lt_mask_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmplt_epi32(a.0, b.0) }) } /// Lanewise `a < b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([0.0, 7.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_lt_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_lt_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmplt_pd(a.0, b.0) }) } /// Low lane `a < b`, other lane unchanged. 
/// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([0.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_lt_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_lt_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmplt_sd(a.0, b.0) }) } /// Lanewise `a != b`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([3.0, 1.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_neq_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_neq_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpneq_pd(a.0, b.0) }) } /// Low lane `a != b`, other lane unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_neq_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_neq_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpneq_sd(a.0, b.0) }) } /// Lanewise `!(a >= b)`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([3.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_nge_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [0, u64::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_nge_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpnge_pd(a.0, b.0) }) } /// Low lane `!(a >= b)`, other lane unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_nge_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [0, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_nge_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpnge_sd(a.0, b.0) }) } /// Lanewise `!(a > b)`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([3.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_ngt_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [0, u64::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_ngt_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpngt_pd(a.0, b.0) }) } /// Low lane `!(a > b)`, other lane unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_ngt_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [0, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_ngt_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpngt_sd(a.0, b.0) }) } /// Lanewise `!(a <= b)`. /// /// Mask output. 
/// ``` /// # use safe_arch::*; /// let a = m128d::from_array([3.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_nle_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_nle_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpnle_pd(a.0, b.0) }) } /// Low lane `!(a <= b)`, other lane unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_nle_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_nle_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpnle_sd(a.0, b.0) }) } /// Lanewise `!(a < b)`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([3.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_nlt_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_nlt_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpnlt_pd(a.0, b.0) }) } /// Low lane `!(a < b)`, other lane unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_nlt_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_nlt_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpnlt_sd(a.0, b.0) }) } /// Lanewise `(!a.is_nan()) & (!b.is_nan())`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([3.0, f64::NAN]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_ordered_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_ordered_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpord_pd(a.0, b.0) }) } /// Low lane `(!a.is_nan()) & (!b.is_nan())`, other lane unchanged. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_ordered_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_ordered_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpord_sd(a.0, b.0) }) } /// Lanewise `a.is_nan() | b.is_nan()`. /// /// Mask output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([f64::NAN, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_unord_mask_m128d(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_unord_mask_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpunord_pd(a.0, b.0) }) } /// Low lane `a.is_nan() | b.is_nan()`, other lane unchanged. /// /// Mask output. 
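/// /// The lanewise sibling, `cmp_unord_mask_m128d`, pairs nicely with an and-not for scrubbing NaN lanes to zero (a sketch, assuming the `bitandnot_m128d` wrapper defined elsewhere in this crate): /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([f64::NAN, 2.0]); /// let scrubbed = bitandnot_m128d(cmp_unord_mask_m128d(a, a), a); /// assert_eq!(scrubbed.to_array(), [0.0, 2.0]); /// ``` ///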
/// ``` /// # use safe_arch::*; /// let a = m128d::from_array([f64::NAN, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = cmp_unord_mask_m128d_s(a, b).to_bits(); /// assert_eq!(c, [u64::MAX, 5_f64.to_bits()]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_unord_mask_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_cmpunord_sd(a.0, b.0) }) } /// Low lane `f64` equal to. /// /// `i32` output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// assert_eq!(1_i32, cmp_eq_i32_m128d_s(a, b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_eq_i32_m128d_s(a: m128d, b: m128d) -> i32 { unsafe { _mm_comieq_sd(a.0, b.0) } } /// Low lane `f64` greater than or equal to. /// /// `i32` output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// assert_eq!(1_i32, cmp_ge_i32_m128d_s(a, b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_ge_i32_m128d_s(a: m128d, b: m128d) -> i32 { unsafe { _mm_comige_sd(a.0, b.0) } } /// Low lane `f64` greater than. /// /// `i32` output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([2.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// assert_eq!(1_i32, cmp_gt_i32_m128d_s(a, b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_gt_i32_m128d_s(a: m128d, b: m128d) -> i32 { unsafe { _mm_comigt_sd(a.0, b.0) } } /// Low lane `f64` less than or equal to. /// /// `i32` output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// assert_eq!(1_i32, cmp_le_i32_m128d_s(a, b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_le_i32_m128d_s(a: m128d, b: m128d) -> i32 { unsafe { _mm_comile_sd(a.0, b.0) } } /// Low lane `f64` less than. /// /// `i32` output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([0.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// assert_eq!(1_i32, cmp_lt_i32_m128d_s(a, b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_lt_i32_m128d_s(a: m128d, b: m128d) -> i32 { unsafe { _mm_comilt_sd(a.0, b.0) } } /// Low lane `f64` not equal to. /// /// `i32` output. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([0.0, 5.0]); /// let b = m128d::from_array([1.0, 1.0]); /// assert_eq!(1_i32, cmp_neq_i32_m128d_s(a, b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn cmp_neq_i32_m128d_s(a: m128d, b: m128d) -> i32 { unsafe { _mm_comineq_sd(a.0, b.0) } } /// Rounds the lower two `i32` lanes to two `f64` lanes. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = convert_to_m128d_from_lower2_i32_m128i(a); /// let c = m128d::from_array([1.0, 2.0]); /// assert_eq!(b.to_bits(), c.to_bits()); /// ``` /// * **Intrinsic:** [`_mm_cvtepi32_pd`] /// * **Assembly:** `cvtdq2pd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_to_m128d_from_lower2_i32_m128i(a: m128i) -> m128d { m128d(unsafe { _mm_cvtepi32_pd(a.0) }) } /// Rounds the four `i32` lanes to four `f32` lanes.
/// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = convert_to_m128_from_i32_m128i(a); /// let c = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// assert_eq!(b.to_bits(), c.to_bits()); /// ``` /// * **Intrinsic:** [`_mm_cvtepi32_ps`] /// * **Assembly:** `cvtdq2ps xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_to_m128_from_i32_m128i(a: m128i) -> m128 { m128(unsafe { _mm_cvtepi32_ps(a.0) }) } /// Rounds the two `f64` lanes to the low two `i32` lanes. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.5]); /// let b = convert_to_i32_m128i_from_m128d(a); /// let c: [i32; 4] = b.into(); /// assert_eq!(c, [1, 2, 0, 0]); /// ``` /// * **Intrinsic:** [`_mm_cvtpd_epi32`] /// * **Assembly:** `cvtpd2dq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_to_i32_m128i_from_m128d(a: m128d) -> m128i { m128i(unsafe { _mm_cvtpd_epi32(a.0) }) } /// Rounds the two `f64` lanes to the low two `f32` lanes. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.5]); /// let b = convert_to_m128_from_m128d(a); /// assert_eq!(b.to_bits(), [1_f32.to_bits(), 2.5_f32.to_bits(), 0, 0]); /// ``` /// * **Intrinsic:** [`_mm_cvtpd_ps`] /// * **Assembly:** `cvtpd2ps xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_to_m128_from_m128d(a: m128d) -> m128 { m128(unsafe { _mm_cvtpd_ps(a.0) }) } /// Rounds the `f32` lanes to `i32` lanes. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.5, 3.0, 4.0]); /// let b = convert_to_i32_m128i_from_m128(a); /// let c: [i32; 4] = b.into(); /// assert_eq!(c, [1, 2, 3, 4]); /// ``` /// * **Intrinsic:** [`_mm_cvtps_epi32`] /// * **Assembly:** `cvtps2dq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_to_i32_m128i_from_m128(a: m128) -> m128i { m128i(unsafe { _mm_cvtps_epi32(a.0) }) } /// Converts the lower two `f32` lanes to two `f64` lanes. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.5, 3.6, 4.7]); /// let b = convert_to_m128d_from_lower2_m128(a); /// assert_eq!(b.to_bits(), [1_f64.to_bits(), 2.5_f64.to_bits()]); /// ``` /// * **Intrinsic:** [`_mm_cvtps_pd`] /// * **Assembly:** `cvtps2pd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_to_m128d_from_lower2_m128(a: m128) -> m128d { m128d(unsafe { _mm_cvtps_pd(a.0) }) } /// Gets the lower lane as an `f64` value. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.5]); /// let b = get_f64_from_m128d_s(a); /// assert_eq!(b, 1.0_f64); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn get_f64_from_m128d_s(a: m128d) -> f64 { unsafe { _mm_cvtsd_f64(a.0) } } /// Converts the lower lane to an `i32` value. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.5]); /// let b = get_i32_from_m128d_s(a); /// assert_eq!(b, 1_i32); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn get_i32_from_m128d_s(a: m128d) -> i32 { unsafe { _mm_cvtsd_si32(a.0) } } /// Converts the lower lane to an `i64` value.
/// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.5]); /// let b = get_i64_from_m128d_s(a); /// assert_eq!(b, 1_i64); /// ``` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn get_i64_from_m128d_s(a: m128d) -> i64 { unsafe { _mm_cvtsd_si64(a.0) } } /// Converts the low `f64` to `f32` and replaces the low lane of the input. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([3.0, 4.0, 5.0, 6.0]); /// let b = m128d::from_array([1.0, 2.5]); /// let c = convert_m128d_s_replace_m128_s(a, b); /// assert_eq!(c.to_array(), [1.0, 4.0, 5.0, 6.0]); /// ``` /// * **Intrinsic:** [`_mm_cvtsd_ss`] /// * **Assembly:** `cvtsd2ss xmm, xmm` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_m128d_s_replace_m128_s(a: m128, b: m128d) -> m128 { m128(unsafe { _mm_cvtsd_ss(a.0, b.0) }) } /// Converts the lower lane to an `i32` value. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 3, 5, 7]); /// let b = get_i32_from_m128i_s(a); /// assert_eq!(b, 1_i32); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn get_i32_from_m128i_s(a: m128i) -> i32 { unsafe { _mm_cvtsi128_si32(a.0) } } /// Converts the lower lane to an `i64` value. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i64, 3]); /// let b = get_i64_from_m128i_s(a); /// assert_eq!(b, 1_i64); /// ``` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn get_i64_from_m128i_s(a: m128i) -> i64 { unsafe { _mm_cvtsi128_si64(a.0) } } /// Convert `i32` to `f64` and replace the low lane of the input. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.0]); /// let b = convert_i32_replace_m128d_s(a, 5_i32); /// assert_eq!(b.to_array(), [5.0, 2.0]); /// ``` /// * **Intrinsic:** [`_mm_cvtsi32_sd`] /// * **Assembly:** `cvtsi2sd xmm, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_i32_replace_m128d_s(a: m128d, i: i32) -> m128d { m128d(unsafe { _mm_cvtsi32_sd(a.0, i) }) } /// Set an `i32` as the low 32-bit lane of an `m128i`, other lanes blank. /// ``` /// # use safe_arch::*; /// let a: [i32; 4] = set_i32_m128i_s(1_i32).into(); /// let b: [i32; 4] = m128i::from([1, 0, 0, 0]).into(); /// assert_eq!(a, b); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_i32_m128i_s(i: i32) -> m128i { m128i(unsafe { _mm_cvtsi32_si128(i) }) } /// Convert `i64` to `f64` and replace the low lane of the input. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.0]); /// let b = convert_i64_replace_m128d_s(a, 5_i64); /// assert_eq!(b.to_array(), [5.0, 2.0]); /// ``` /// * **Intrinsic:** [`_mm_cvtsi64_sd`] /// * **Assembly:** `cvtsi2sd xmm, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_i64_replace_m128d_s(a: m128d, i: i64) -> m128d { m128d(unsafe { _mm_cvtsi64_sd(a.0, i) }) } /// Set an `i64` as the low 64-bit lane of an `m128i`, other lanes blank. 
/// ``` /// # use safe_arch::*; /// let a: [i64; 2] = set_i64_m128i_s(1_i64).into(); /// let b: [i64; 2] = m128i::from([1_i64, 0]).into(); /// assert_eq!(a, b); /// ``` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_i64_m128i_s(i: i64) -> m128i { m128i(unsafe { _mm_cvtsi64_si128(i) }) } /// Converts the lower `f32` to `f64` and replace the low lane of the input /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.5]); /// let b = m128::from_array([3.0, 4.0, 5.0, 6.0]); /// let c = convert_m128_s_replace_m128d_s(a, b); /// assert_eq!(c.to_array(), [3.0, 2.5]); /// ``` /// * **Intrinsic:** [`_mm_cvtss_sd`] /// * **Assembly:** `cvtss2sd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn convert_m128_s_replace_m128d_s(a: m128d, b: m128) -> m128d { m128d(unsafe { _mm_cvtss_sd(a.0, b.0) }) } /// Truncate the `f64` lanes to the lower `i32` lanes (upper `i32` lanes 0). /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.1, 2.6]); /// let b = truncate_m128d_to_m128i(a); /// assert_eq!(<[i32; 4]>::from(b), [1, 2, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn truncate_m128d_to_m128i(a: m128d) -> m128i { m128i(unsafe { _mm_cvttpd_epi32(a.0) }) } /// Truncate the `f32` lanes to `i32` lanes. /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.1, 2.6, 3.5, 4.0]); /// let b = truncate_m128_to_m128i(a); /// assert_eq!(<[i32; 4]>::from(b), [1, 2, 3, 4]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn truncate_m128_to_m128i(a: m128) -> m128i { m128i(unsafe { _mm_cvttps_epi32(a.0) }) } /// Truncate the lower lane into an `i32`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.7, 2.6]); /// assert_eq!(truncate_to_i32_m128d_s(a), 1_i32); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn truncate_to_i32_m128d_s(a: m128d) -> i32 { unsafe { _mm_cvttsd_si32(a.0) } } /// Truncate the lower lane into an `i64`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.7, 2.6]); /// assert_eq!(truncate_to_i64_m128d_s(a), 1_i64); /// ``` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn truncate_to_i64_m128d_s(a: m128d) -> i64 { unsafe { _mm_cvttsd_si64(a.0) } } /// Lanewise `a / b`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 42.0]); /// let b = m128d::from_array([100.0, -6.0]); /// let c = div_m128d(a, b).to_array(); /// assert_eq!(c, [0.92, -7.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn div_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_div_pd(a.0, b.0) }) } /// Lowest lane `a / b`, high lane unchanged. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 87.5]); /// let b = m128d::from_array([100.0, -600.0]); /// let c = div_m128d_s(a, b).to_array(); /// assert_eq!(c, [0.92, 87.5]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn div_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_div_sd(a.0, b.0) }) } /// Gets an `i16` value out of an `m128i`, returns as `i32`. /// /// The lane to get must be a constant in `0..8`. 
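/// /// The extracted bits are zero-extended rather than sign-extended, so a negative lane comes back as its unsigned 16-bit value: /// ``` /// # use safe_arch::*; /// let a = m128i::from([-1_i16, 0, 0, 0, 0, 0, 0, 0]); /// assert_eq!(extract_i16_as_i32_m128i::<0>(a), 0xFFFF); /// ```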
/// /// ``` /// # use safe_arch::*; /// let a = m128i::from([0xA_i16, 0xB, 0xC, 0xD, 0, 0, 0, 0]); /// // /// assert_eq!(extract_i16_as_i32_m128i::<0>(a), 0xA); /// assert_eq!(extract_i16_as_i32_m128i::<1>(a), 0xB); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn extract_i16_as_i32_m128i(a: m128i) -> i32 { unsafe { _mm_extract_epi16(a.0, LANE) } } /// Inserts the low 16 bits of an `i32` value into an `m128i`. /// /// The lane to get must be a constant in `0..8`. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([0xA_i16, 0xB, 0xC, 0xD, 0, 0, 0, 0]); /// // /// let b = insert_i16_from_i32_m128i::<0>(a, -1); /// assert_eq!(<[i16; 8]>::from(b), [-1, 0xB, 0xC, 0xD, 0, 0, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn insert_i16_from_i32_m128i(a: m128i, i: i32) -> m128i { m128i(unsafe { _mm_insert_epi16(a.0, i, LANE) }) } /// Loads the reference into a register. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([10.0, 12.0]); /// let b = load_m128d(&a); /// assert_eq!(a.to_bits(), b.to_bits()); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_m128d(a: &m128d) -> m128d { m128d(unsafe { _mm_load_pd(a as *const m128d as *const f64) }) } /// Loads the `f64` reference into all lanes of a register. /// ``` /// # use safe_arch::*; /// let a = 1.0; /// let b = load_f64_splat_m128d(&a); /// assert_eq!(m128d::from_array([1.0, 1.0]).to_bits(), b.to_bits()); /// ``` #[must_use] #[inline(always)] #[allow(clippy::trivially_copy_pass_by_ref)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_f64_splat_m128d(a: &f64) -> m128d { m128d(unsafe { _mm_load1_pd(a) }) } /// Loads the reference into the low lane of the register. /// ``` /// # use safe_arch::*; /// let a = 1.0; /// let b = load_f64_m128d_s(&a); /// assert_eq!(m128d::from_array([1.0, 0.0]).to_bits(), b.to_bits()); /// ``` #[must_use] #[inline(always)] #[allow(clippy::trivially_copy_pass_by_ref)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_f64_m128d_s(a: &f64) -> m128d { m128d(unsafe { _mm_load_sd(a) }) } /// Loads the reference into a register. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = load_m128i(&a); /// assert_eq!(<[i32; 4]>::from(a), <[i32; 4]>::from(b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_m128i(a: &m128i) -> m128i { m128i(unsafe { _mm_load_si128(a as *const m128i as *const __m128i) }) } /// Loads the reference into a register, replacing the high lane. /// ``` /// # use safe_arch::*; /// let a = m128d::from([1.0, 2.0]); /// let double = 7.0; /// let b = load_replace_high_m128d(a, &double); /// assert_eq!(b.to_array(), [1.0, 7.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_replace_high_m128d(a: m128d, b: &f64) -> m128d { m128d(unsafe { _mm_loadh_pd(a.0, b) }) } /// Loads the low `i64` into a register. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i64, 2]); /// let b = load_i64_m128i_s(&a); /// assert_eq!([1_i64, 0], <[i64; 2]>::from(b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_i64_m128i_s(a: &m128i) -> m128i { m128i(unsafe { _mm_loadl_epi64(a as *const m128i as *const __m128i) }) } /// Loads the reference into a register, replacing the low lane. 
/// ``` /// # use safe_arch::*; /// let a = m128d::from([1.0, 2.0]); /// let double = 7.0; /// let b = load_replace_low_m128d(a, &double); /// assert_eq!(b.to_array(), [7.0, 2.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_replace_low_m128d(a: m128d, b: &f64) -> m128d { m128d(unsafe { _mm_loadl_pd(a.0, b) }) } /// Loads the reference into a register with reversed order. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([10.0, 12.0]); /// let b = load_reverse_m128d(&a); /// assert_eq!(m128d::from_array([12.0, 10.0]).to_bits(), b.to_bits()); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_reverse_m128d(a: &m128d) -> m128d { m128d(unsafe { _mm_loadr_pd(a as *const m128d as *const f64) }) } /// Loads the reference into a register. /// /// This generally has no speed penalty if the reference happens to be 16-byte /// aligned, but there is a slight speed penalty if the reference is only 8-byte /// aligned. /// ``` /// # use safe_arch::*; /// let a = [10.0, 12.0]; /// let b = load_unaligned_m128d(&a); /// assert_eq!(m128d::from_array(a).to_bits(), b.to_bits()); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_unaligned_m128d(a: &[f64; 2]) -> m128d { m128d(unsafe { _mm_loadu_pd(a as *const [f64; 2] as *const f64) }) } /// Loads the reference into a register. /// /// This generally has no speed penalty if the reference happens to be 16-byte /// aligned, but there is a slight speed penalty if the reference is less /// aligned. /// ``` /// # use safe_arch::*; /// let a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; /// let b = load_unaligned_m128i(&a); /// assert_eq!(a, <[u8; 16]>::from(b)); /// ``` #[must_use] #[inline(always)] #[allow(clippy::cast_ptr_alignment)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn load_unaligned_m128i(a: &[u8; 16]) -> m128i { m128i(unsafe { _mm_loadu_si128(a as *const [u8; 16] as *const __m128i) }) } /// Multiply `i16` lanes producing `i32` values, horizontal add pairs of `i32` /// values to produce the final output. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([5_i16, 6, 7, 8, -15, -26, -37, 48]); /// let c: [i32; 4] = mul_i16_horizontal_add_m128i(a, b).into(); /// assert_eq!(c, [17, 53, 67, -81]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn mul_i16_horizontal_add_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_madd_epi16(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `u8`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b = m128i::from([0_u8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127]); /// let c: [u8; 16] = max_u8_m128i(a, b).into(); /// assert_eq!(c, [0, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn max_u8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_max_epu8(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `i16`. 
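/// /// Chained with [`min_i16_m128i`] this gives a branchless lanewise clamp, for example: /// ``` /// # use safe_arch::*; /// let x = m128i::from([-500_i16, 0, 20, 700, 1, 2, 3, 4]); /// let lo = set_splat_i16_m128i(0); /// let hi = set_splat_i16_m128i(100); /// let clamped: [i16; 8] = min_i16_m128i(max_i16_m128i(x, lo), hi).into(); /// assert_eq!(clamped, [0, 0, 20, 100, 1, 2, 3, 4]); /// ``` ///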
/// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([5_i16, 6, 7, 8, -15, -26, -37, 48]); /// let c: [i16; 8] = max_i16_m128i(a, b).into(); /// assert_eq!(c, [5_i16, 6, 7, 8, -1, -2, -3, 48]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn max_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_max_epi16(a.0, b.0) }) } /// Lanewise `max(a, b)`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([5.0, 2.0]); /// let b = m128d::from_array([1.0, 6.0]); /// let c = max_m128d(a, b).to_array(); /// assert_eq!(c, [5.0, 6.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn max_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_max_pd(a.0, b.0) }) } /// Low lane `max(a, b)`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 12.0]); /// let b = m128d::from_array([5.0, 6.0]); /// let c = max_m128d_s(a, b).to_array(); /// assert_eq!(c, [5.0, 12.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn max_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_max_sd(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `u8`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b = m128i::from([0_u8, 11, 2, 13, 4, 15, 6, 17, 8, 0, 20, 0, 22, 0, 24, 0]); /// let c: [u8; 16] = min_u8_m128i(a, b).into(); /// assert_eq!(c, [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 0, 10, 0, 12, 0, 14, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn min_u8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_min_epu8(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([5_i16, 6, 7, 8, -15, -26, -37, 48]); /// let c: [i16; 8] = min_i16_m128i(a, b).into(); /// assert_eq!(c, [1_i16, 2, 3, 4, -15, -26, -37, -4]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn min_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_min_epi16(a.0, b.0) }) } /// Lanewise `min(a, b)`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 12.0]); /// let b = m128d::from_array([5.0, 6.0]); /// let c = min_m128d(a, b).to_array(); /// assert_eq!(c, [1.0, 6.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn min_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_min_pd(a.0, b.0) }) } /// Low lane `min(a, b)`, other lanes unchanged. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 12.0]); /// let b = m128d::from_array([0.0, 6.0]); /// let c = min_m128d_s(a, b).to_array(); /// assert_eq!(c, [0.0, 12.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn min_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_min_sd(a.0, b.0) }) } /// Copy the low `i64` lane to a new register, upper bits 0. 
/// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i64, 2]); /// let b = copy_i64_m128i_s(a); /// assert_eq!(<[i64; 2]>::from(b), [1, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn copy_i64_m128i_s(a: m128i) -> m128i { m128i(unsafe { _mm_move_epi64(a.0) }) } /// Copies the `a` value and replaces the low lane with the low `b` value. /// ``` /// # use safe_arch::*; /// let a = m128d::from([1.0, 2.0]); /// let b = m128d::from([3.0, 4.0]); /// let c = copy_replace_low_f64_m128d(a, b); /// assert_eq!(c.to_array(), [3.0, 2.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn copy_replace_low_f64_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_move_sd(a.0, b.0) }) } /// Gathers the `i8` sign bit of each lane. /// /// The output has lane 0 as bit 0, lane 1 as bit 1, and so on. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, -11, -2, 13, 4, 15, -6, 17, 8, 19, -20, 21, 22, 23, -24, 127]); /// let i = move_mask_i8_m128i(a); /// assert_eq!(i, 0b0100010001000110); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn move_mask_i8_m128i(a: m128i) -> i32 { unsafe { _mm_movemask_epi8(a.0) } } /// Gathers the sign bit of each lane. /// /// The output has lane 0 as bit 0, lane 1 as bit 1. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([-1.0, 12.0]); /// let i = move_mask_m128d(a); /// assert_eq!(i, 0b01); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn move_mask_m128d(a: m128d) -> i32 { unsafe { _mm_movemask_pd(a.0) } } /// Multiplies the odd `u32` lanes and gives the widened (`u64`) results. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 7, u32::MAX, 7]); /// let b = m128i::from([5, 7, u32::MAX, 7]); /// let c: [u64; 2] = mul_widen_u32_odd_m128i(a, b).into(); /// assert_eq!(c, [(1 * 5), (u32::MAX as u64 * u32::MAX as u64)]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn mul_widen_u32_odd_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_mul_epu32(a.0, b.0) }) } /// Lanewise `a * b`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 87.5]); /// let b = m128d::from_array([100.0, -6.0]); /// let c = mul_m128d(a, b).to_array(); /// assert_eq!(c, [9200.0, -525.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn mul_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_mul_pd(a.0, b.0) }) } /// Lowest lane `a * b`, high lane unchanged. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 87.5]); /// let b = m128d::from_array([100.0, -600.0]); /// let c = mul_m128d_s(a, b).to_array(); /// assert_eq!(c, [9200.0, 87.5]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn mul_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_mul_sd(a.0, b.0) }) } /// Lanewise `a * b` with lanes as `i16`, keep the high bits of the `i32` /// intermediates. 
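/// /// Pairing this with [`mul_i16_keep_low_m128i`] and an interleave rebuilds the full widened products; a sketch using [`unpack_low_i16_m128i`] from later in this module: /// ``` /// # use safe_arch::*; /// let a = set_splat_i16_m128i(300); /// let b = set_splat_i16_m128i(700); /// let hi = mul_i16_keep_high_m128i(a, b); /// let lo = mul_i16_keep_low_m128i(a, b); /// let widened: [i32; 4] = unpack_low_i16_m128i(lo, hi).into(); /// assert_eq!(widened, [300 * 700; 4]); /// ``` ///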
/// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 200, 300, 4568, -1, -2, -3, -4]); /// let b = m128i::from([5_i16, 600, 700, 8910, -15, -26, -37, 48]); /// let c: [i16; 8] = mul_i16_keep_high_m128i(a, b).into(); /// assert_eq!(c, [0, 1, 3, 621, 0, 0, 0, -1]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn mul_i16_keep_high_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_mulhi_epi16(a.0, b.0) }) } /// Lanewise `a * b` with lanes as `u16`, keep the high bits of the `u32` /// intermediates. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u16, 2003, 3005, 45687, 1, 2, 3, 4]); /// let b = m128i::from([5_u16, 6004, 7006, 8910, 15, 26, 37, 48]); /// let c: [u16; 8] = mul_u16_keep_high_m128i(a, b).into(); /// assert_eq!(c, [0, 183, 321, 6211, 0, 0, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn mul_u16_keep_high_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_mulhi_epu16(a.0, b.0) }) } /// Lanewise `a * b` with lanes as `i16`, keep the low bits of the `i32` /// intermediates. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 200, 300, 4568, -1, -2, -3, -4]); /// let b = m128i::from([5_i16, 600, 700, 8910, -15, -26, -37, 48]); /// let c: [i16; 8] = mul_i16_keep_low_m128i(a, b).into(); /// assert_eq!(c, [5, -11072, 13392, 3024, 15, 52, 111, -192]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn mul_i16_keep_low_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_mullo_epi16(a.0, b.0) }) } /// Bitwise `a | b`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = bitor_m128d(a, b).to_array(); /// assert_eq!(c, [1.0, 1.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn bitor_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_or_pd(a.0, b.0) }) } /// Bitwise `a | b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 0, 1, 0]); /// let b = m128i::from([1, 1, 0, 0]); /// let c: [i32; 4] = bitor_m128i(a, b).into(); /// assert_eq!(c, [1, 1, 1, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn bitor_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_or_si128(a.0, b.0) }) } /// Saturating convert `i16` to `i8`, and pack the values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, 5, 6, 7, 8]); /// let b = m128i::from([9_i16, 10, 11, 12, 13, 14, 15, 16]); /// let c: [i8; 16] = pack_i16_to_i8_m128i(a, b).into(); /// assert_eq!(c, [1_i8, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn pack_i16_to_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_packs_epi16(a.0, b.0) }) } /// Saturating convert `i32` to `i16`, and pack the values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i32, 2, 3, 4]); /// let b = m128i::from([5_i32, 6, 7, 8]); /// let c: [i16; 8] = pack_i32_to_i16_m128i(a, b).into(); /// assert_eq!(c, [1_i16, 2, 3, 4, 5, 6, 7, 8]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn pack_i32_to_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_packs_epi32(a.0, b.0) }) } /// Saturating convert `i16` to `u8`, and pack the values. 
/// ``` /// # use safe_arch::*; /// let a = m128i::from([-1_i16, 2, -3, 4, -5, 6, -7, 8]); /// let b = m128i::from([9_i16, 10, 11, 12, 13, -14, 15, -16]); /// let c: [u8; 16] = pack_i16_to_u8_m128i(a, b).into(); /// assert_eq!(c, [0_u8, 2, 0, 4, 0, 6, 0, 8, 9, 10, 11, 12, 13, 0, 15, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn pack_i16_to_u8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_packus_epi16(a.0, b.0) }) } /// Compute "sum of `u8` absolute differences". /// /// * `u8` lanewise `abs(a - b)`, producing `u8` intermediate values. /// * Sum the first eight and second eight values. /// * Place into the low 16 bits of two `u64` lanes. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_u8, 11, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127]); /// let b = m128i::from([20_u8, 110, 250, 103, 34, 105, 60, 217, 8, 19, 210, 201, 202, 203, 204, 127]); /// let c: [u64; 2] = sum_of_u8_abs_diff_m128i(a, b).into(); /// assert_eq!(c, [831_u64, 910]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sum_of_u8_abs_diff_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_sad_epu8(a.0, b.0) }) } /// Sets the args into an `m128i`, first arg is the high lane. /// ``` /// # use safe_arch::*; /// let a = m128i::from([15_i8, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]); /// let b = set_i8_m128i(0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); /// assert_eq!(<[i8; 16]>::from(a), <[i8; 16]>::from(b)); /// ``` #[must_use] #[inline(always)] #[allow(clippy::too_many_arguments)] #[allow(clippy::many_single_char_names)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_i8_m128i(a: i8, b: i8, c: i8, d: i8, e: i8, f: i8, g: i8, h: i8, i: i8, j: i8, k: i8, l: i8, m: i8, n: i8, o: i8, p: i8) -> m128i { m128i(unsafe { _mm_set_epi8(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) }) } /// Sets the args into an `m128i`, first arg is the high lane. /// ``` /// # use safe_arch::*; /// let a = m128i::from([7_i16, 6, 5, 4, 3, 2, 1, 0]); /// let b = set_i16_m128i(0_i16, 1, 2, 3, 4, 5, 6, 7); /// assert_eq!(<[i16; 8]>::from(a), <[i16; 8]>::from(b)); /// ``` #[must_use] #[inline(always)] #[allow(clippy::too_many_arguments)] #[allow(clippy::many_single_char_names)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_i16_m128i(a: i16, b: i16, c: i16, d: i16, e: i16, f: i16, g: i16, h: i16) -> m128i { m128i(unsafe { _mm_set_epi16(a, b, c, d, e, f, g, h) }) } /// Sets the args into an `m128i`, first arg is the high lane. /// ``` /// # use safe_arch::*; /// let a = m128i::from([3, 2, 1, 0]); /// let b = set_i32_m128i(0, 1, 2, 3); /// assert_eq!(<[i32; 4]>::from(a), <[i32; 4]>::from(b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_i32_m128i(a: i32, b: i32, c: i32, d: i32) -> m128i { m128i(unsafe { _mm_set_epi32(a, b, c, d) }) } /// Sets the args into an `m128i`, first arg is the high lane. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i64, 0]); /// let b = set_i64_m128i(0, 1); /// assert_eq!(<[i64; 2]>::from(a), <[i64; 2]>::from(b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_i64_m128i(a: i64, b: i64) -> m128i { m128i(unsafe { _mm_set_epi64x(a, b) }) } /// Sets the args into an `m128d`, first arg is the high lane.
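/// /// This matches `_mm_set_pd`, so the argument order is the reverse of the `to_array` / memory order; use [`set_reversed_m128d`] if you'd rather list the low lane first. ///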
/// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 0.0]); /// let b = set_m128d(0.0, 1.0); /// assert_eq!(a.to_array(), b.to_array()); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_m128d(a: f64, b: f64) -> m128d { m128d(unsafe { _mm_set_pd(a, b) }) } /// Sets the arg into the low lane of a `m128d`, with the high lane zeroed. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 0.0]); /// let b = set_m128d_s(1.0); /// assert_eq!(a.to_array(), b.to_array()); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_m128d_s(a: f64) -> m128d { m128d(unsafe { _mm_set_sd(a) }) } /// Splats the `f64` into both lanes of the `m128d`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 1.0]); /// let b = set_splat_m128d(1.0); /// assert_eq!(a.to_array(), b.to_array()); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_splat_m128d(a: f64) -> m128d { m128d(unsafe { _mm_set1_pd(a) }) } /// Splats the `i8` to all lanes of the `m128i`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i8, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]); /// let b = set_splat_i8_m128i(1); /// assert_eq!(<[i8; 16]>::from(a), <[i8; 16]>::from(b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_splat_i8_m128i(i: i8) -> m128i { m128i(unsafe { _mm_set1_epi8(i) }) } /// Splats the `i16` to all lanes of the `m128i`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 1, 1, 1, 1, 1, 1, 1]); /// let b = set_splat_i16_m128i(1); /// assert_eq!(<[i16; 8]>::from(a), <[i16; 8]>::from(b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_splat_i16_m128i(i: i16) -> m128i { m128i(unsafe { _mm_set1_epi16(i) }) } /// Splats the `i32` to all lanes of the `m128i`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 1, 1, 1]); /// let b = set_splat_i32_m128i(1); /// assert_eq!(<[i32; 4]>::from(a), <[i32; 4]>::from(b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_splat_i32_m128i(i: i32) -> m128i { m128i(unsafe { _mm_set1_epi32(i) }) } /// Splats the `i64` to both lanes of the `m128i`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i64, 1]); /// let b = set_splat_i64_m128i(1); /// assert_eq!(<[i64; 2]>::from(a), <[i64; 2]>::from(b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_splat_i64_m128i(i: i64) -> m128i { m128i(unsafe { _mm_set1_epi64x(i) }) } /// Sets the args into an `m128i`, first arg is the low lane. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b = set_reversed_i8_m128i(0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); /// assert_eq!(<[i8; 16]>::from(a), <[i8; 16]>::from(b)); /// ``` #[must_use] #[inline(always)] #[allow(clippy::too_many_arguments)] #[allow(clippy::many_single_char_names)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_reversed_i8_m128i(a: i8, b: i8, c: i8, d: i8, e: i8, f: i8, g: i8, h: i8, i: i8, j: i8, k: i8, l: i8, m: i8, n: i8, o: i8, p: i8) -> m128i { m128i(unsafe { _mm_setr_epi8(a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p) }) } /// Sets the args into an `m128i`, first arg is the low lane.
/// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i16, 1, 2, 3, 4, 5, 6, 7]); /// let b = set_reversed_i16_m128i(0_i16, 1, 2, 3, 4, 5, 6, 7); /// assert_eq!(<[i16; 8]>::from(a), <[i16; 8]>::from(b)); /// ``` #[must_use] #[inline(always)] #[allow(clippy::too_many_arguments)] #[allow(clippy::many_single_char_names)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_reversed_i16_m128i(a: i16, b: i16, c: i16, d: i16, e: i16, f: i16, g: i16, h: i16) -> m128i { m128i(unsafe { _mm_setr_epi16(a, b, c, d, e, f, g, h) }) } /// Sets the args into an `m128i`, first arg is the low lane. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0, 1, 2, 3]); /// let b = set_reversed_i32_m128i(0, 1, 2, 3); /// assert_eq!(<[i32; 4]>::from(a), <[i32; 4]>::from(b)); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_reversed_i32_m128i(a: i32, b: i32, c: i32, d: i32) -> m128i { m128i(unsafe { _mm_setr_epi32(a, b, c, d) }) } /// Sets the args into an `m128d`, first arg is the low lane. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([0.0, 1.0]); /// let b = set_reversed_m128d(0.0, 1.0); /// assert_eq!(a.to_array(), b.to_array()); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn set_reversed_m128d(a: f64, b: f64) -> m128d { m128d(unsafe { _mm_setr_pd(a, b) }) } /// All lanes zero. /// ``` /// # use safe_arch::*; /// let a = zeroed_m128i(); /// assert_eq!(u128::from(a), 0); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn zeroed_m128i() -> m128i { m128i(unsafe { _mm_setzero_si128() }) } /// Both lanes zero. /// ``` /// # use safe_arch::*; /// let a = zeroed_m128d(); /// assert_eq!(a.to_array(), [0.0, 0.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn zeroed_m128d() -> m128d { m128d(unsafe { _mm_setzero_pd() }) } /// Shuffle the `i32` lanes in `a` using an immediate /// control value. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([6, 7, 8, 9]); /// // /// let c = shuffle_ai_f32_all_m128i::<0b01_10_10_00>(a); /// assert_eq!(<[i32; 4]>::from(c), [6, 8, 8, 7]); /// ``` /// * **Intrinsic:** [`_mm_shuffle_epi32`] /// * **Assembly:** `pshufd xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shuffle_ai_f32_all_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_shuffle_epi32(a.0, MASK) }) } /// Shuffle the `f64` lanes from `a` and `b` together using an immediate /// control value. /// /// Output lane 0 always comes from `a` and output lane 1 always comes from `b`. /// Bit 0 of `MASK` picks which lane of `a` is used, and bit 1 picks which lane /// of `b` is used. /// /// You can pass the same value as both arguments, but if you want to swizzle /// within only a single register and you have `avx` available consider using /// [`shuffle_ai_f64_all_m128d`] instead. You'll get much better performance.
/// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.0]); /// let b = m128d::from_array([3.0, 4.0]); /// // /// let c = shuffle_abi_f64_all_m128d::<0b00>(a, b).to_array(); /// assert_eq!(c, [1.0, 3.0]); /// // /// let c = shuffle_abi_f64_all_m128d::<0b10>(a, b).to_array(); /// assert_eq!(c, [1.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shuffle_abi_f64_all_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_shuffle_pd(a.0, b.0, MASK) }) } /// Shuffle the high `i16` lanes in `$a` using an immediate control value. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, 5, 6, 7, 8]); /// let c = shuffle_ai_i16_h64all_m128i::<0b01_00_10_11>(a); /// assert_eq!(<[i16; 8]>::from(c), [1_i16, 2, 3, 4, 8, 7, 5, 6]); /// ``` /// * **Intrinsic:** [`_mm_shufflehi_epi16`] /// * **Assembly:** `pshufhw xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shuffle_ai_i16_h64all_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_shufflehi_epi16(a.0, MASK) }) } /// Shuffle the low `i16` lanes in `$a` using an immediate control value. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, 5, 6, 7, 8]); /// // /// let c = shuffle_ai_i16_l64all_m128i::<0b01_11_10_00>(a); /// assert_eq!(<[i16; 8]>::from(c), [1_i16, 3, 4, 2, 5, 6, 7, 8]); /// ``` /// * **Intrinsic:** [`_mm_shufflelo_epi16`] /// * **Assembly:** `pshuflw xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shuffle_ai_i16_l64all_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_shufflelo_epi16(a.0, MASK) }) } /// Shift all `u16` lanes to the left by the `count` in the lower `u64` lane. /// /// New bits are 0s. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u16, 2, 3, 4, 1, 2, 3, 4]); /// let b = m128i::from([3_u64, 0]); /// let c: [u16; 8] = shl_all_u16_m128i(a, b).into(); /// assert_eq!(c, [1_u16 << 3, 2 << 3, 3 << 3, 4 << 3, 1 << 3, 2 << 3, 3 << 3, 4 << 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shl_all_u16_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_sll_epi16(a.0, count.0) }) } /// Shift all `u32` lanes to the left by the `count` in the lower `u64` lane. /// /// New bits are 0s. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u32, 2, 3, 4]); /// let b = m128i::from([3_u64, 0]); /// let c: [u32; 4] = shl_all_u32_m128i(a, b).into(); /// assert_eq!(c, [1 << 3, 2 << 3, 3 << 3, 4 << 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shl_all_u32_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_sll_epi32(a.0, count.0) }) } /// Shift all `u64` lanes to the left by the `count` in the lower `u64` lane. /// /// New bits are 0s. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u64, 2]); /// let b = m128i::from([3_u64, 0]); /// let c: [u64; 2] = shl_all_u64_m128i(a, b).into(); /// assert_eq!(c, [1 << 3, 2 << 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shl_all_u64_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_sll_epi64(a.0, count.0) }) } /// Shifts all `u16` lanes left by an immediate. 
/// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u16, 2, 3, 4, 1, 2, 3, 4]); /// let c: [u16; 8] = shl_imm_u16_m128i::<3>(a).into(); /// assert_eq!(c, [1_u16 << 3, 2 << 3, 3 << 3, 4 << 3, 1 << 3, 2 << 3, 3 << 3, 4 << 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shl_imm_u16_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_slli_epi16(a.0, IMM) }) } /// Shifts all `u32` lanes left by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let c: [u32; 4] = shl_imm_u32_m128i::<3>(a).into(); /// assert_eq!(c, [1 << 3, 2 << 3, 3 << 3, 4 << 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shl_imm_u32_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_slli_epi32(a.0, IMM) }) } /// Shifts both `u64` lanes left by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u64, 2]); /// let c: [u64; 2] = shl_imm_u64_m128i::<3>(a).into(); /// assert_eq!(c, [1_u64 << 3, 2 << 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shl_imm_u64_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_slli_epi64(a.0, IMM) }) } /// Lanewise `sqrt(a)`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([25.0, 16.0]); /// let b = sqrt_m128d(a).to_array(); /// assert_eq!(b, [5.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sqrt_m128d(a: m128d) -> m128d { m128d(unsafe { _mm_sqrt_pd(a.0) }) } /// Low lane `sqrt(b)`, upper lane is unchanged from `a`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.0]); /// let b = m128d::from_array([25.0, 4.0]); /// let c = sqrt_m128d_s(a, b); /// assert_eq!(c.to_array(), [5.0, 2.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sqrt_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_sqrt_sd(a.0, b.0) }) } /// Shift each `i16` lane to the right by the `count` in the lower `i64` lane. /// /// New bits are the sign bit. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([3_i64, 0]); /// let c: [i16; 8] = shr_all_i16_m128i(a, b).into(); /// assert_eq!(c, [1_i16 >> 3, 2 >> 3, 3 >> 3, 4 >> 3, -1 >> 3, -2 >> 3, -3 >> 3, -4 >> 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_all_i16_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_sra_epi16(a.0, count.0) }) } /// Shift each `i32` lane to the right by the `count` in the lower `i64` lane. /// /// New bits are the sign bit. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i32, 2, -3, -4]); /// let b = m128i::from([3_i64, 0]); /// let c: [i32; 4] = shr_all_i32_m128i(a, b).into(); /// assert_eq!(c, [1 >> 3, 2 >> 3, -3 >> 3, -4 >> 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_all_i32_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_sra_epi32(a.0, count.0) }) } /// Shifts all `i16` lanes right by an immediate. /// /// New bits are the sign bit. 
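/// /// Shifting by 15 copies the sign bit across the whole lane, which makes a cheap lanewise "is negative" mask: /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, -2, 3, -4, 5, -6, 7, -8]); /// let m: [i16; 8] = shr_imm_i16_m128i::<15>(a).into(); /// assert_eq!(m, [0, -1, 0, -1, 0, -1, 0, -1]); /// ```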
/// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let c: [i16; 8] = shr_imm_i16_m128i::<3>(a).into(); /// assert_eq!(c, [1_i16 >> 3, 2 >> 3, 3 >> 3, 4 >> 3, -1 >> 3, -2 >> 3, -3 >> 3, -4 >> 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_imm_i16_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_srai_epi16(a.0, IMM) }) } /// Shifts all `i32` lanes right by an immediate. /// /// New bits are the sign bit. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, -3, -4]); /// let c: [i32; 4] = shr_imm_i32_m128i::<3>(a).into(); /// assert_eq!(c, [1 >> 3, 2 >> 3, -3 >> 3, -4 >> 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_imm_i32_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_srai_epi32(a.0, IMM) }) } /// Shift each `u16` lane to the right by the `count` in the lower `u64` lane. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u16, 2, 3, 4, 100, 200, 300, 400]); /// let b = m128i::from([3_u64, 0]); /// let c: [u16; 8] = shr_all_u16_m128i(a, b).into(); /// assert_eq!(c, [1_u16 >> 3, 2 >> 3, 3 >> 3, 4 >> 3, 100 >> 3, 200 >> 3, 300 >> 3, 400 >> 3,]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_all_u16_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_srl_epi16(a.0, count.0) }) } /// Shift each `u32` lane to the right by the `count` in the lower `u64` lane. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u32, 2, 300, 400]); /// let b = m128i::from([3_u64, 0]); /// let c: [u32; 4] = shr_all_u32_m128i(a, b).into(); /// assert_eq!(c, [1 >> 3, 2 >> 3, 300 >> 3, 400 >> 3,]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_all_u32_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_srl_epi32(a.0, count.0) }) } /// Shift each `u64` lane to the right by the `count` in the lower `u64` lane. /// /// New bits are 0s. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u64, 56]); /// let b = m128i::from([3_u64, 0]); /// let c: [u64; 2] = shr_all_u64_m128i(a, b).into(); /// assert_eq!(c, [1 >> 3, 56 >> 3]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_all_u64_m128i(a: m128i, count: m128i) -> m128i { m128i(unsafe { _mm_srl_epi64(a.0, count.0) }) } /// Shifts all `u16` lanes right by an immediate. /// /// New bits are 0s. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u16, 2, 3, 4, 100, 200, 300, 400]); /// let c: [u16; 8] = shr_imm_u16_m128i::<3>(a).into(); /// assert_eq!(c, [1_u16 >> 3, 2 >> 3, 3 >> 3, 4 >> 3, 100 >> 3, 200 >> 3, 300 >> 3, 400 >> 3,]); /// ``` /// * **Intrinsic:** [`_mm_srli_epi16`] /// * **Assembly:** `psrlw xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_imm_u16_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_srli_epi16(a.0, IMM) }) } /// Shifts all `u32` lanes right by an immediate. 
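/// /// New bits are 0s.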
/// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 300, 400]); /// let c: [u32; 4] = shr_imm_u32_m128i::<3>(a).into(); /// assert_eq!(c, [1 >> 3, 2 >> 3, 300 >> 3, 400 >> 3]); /// ``` /// * **Intrinsic:** [`_mm_srli_epi32`] /// * **Assembly:** `psrld xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_imm_u32_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_srli_epi32(a.0, IMM) }) } /// Shifts both `u64` lanes right by an immediate. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u64, 200]); /// let c: [u64; 2] = shr_imm_u64_m128i::<3>(a).into(); /// assert_eq!(c, [1_u64 >> 3, 200 >> 3]); /// ``` /// * **Intrinsic:** [`_mm_srli_epi64`] /// * **Assembly:** `psrlq xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn shr_imm_u64_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_srli_epi64(a.0, IMM) }) } /// Stores the value to the reference given. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([10.0, 12.0]); /// let mut b = zeroed_m128d(); /// store_m128d(&mut b, a); /// let c = b.to_array(); /// assert_eq!(c, [10.0, 12.0]); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn store_m128d(r: &mut m128d, a: m128d) { unsafe { _mm_store_pd(r as *mut m128d as *mut f64, a.0) } } /// Stores the low lane value to the reference given. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([10.0, 12.0]); /// let mut f = 0.0; /// store_m128d_s(&mut f, a); /// assert_eq!(f, 10.0); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn store_m128d_s(r: &mut f64, a: m128d) { unsafe { _mm_store_sd(r as *mut f64, a.0) } } /// Stores the low lane value to all lanes of the reference given. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([10.0, 12.0]); /// let mut b = zeroed_m128d(); /// store_splat_m128d(&mut b, a); /// let c = b.to_array(); /// assert_eq!(c, [10.0, 10.0]); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn store_splat_m128d(r: &mut m128d, a: m128d) { unsafe { _mm_store1_pd(r as *mut m128d as *mut f64, a.0) } } /// Stores the value to the reference given. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let mut b = zeroed_m128i(); /// store_m128i(&mut b, a); /// let c: [i32; 4] = b.into(); /// assert_eq!(c, [1, 2, 3, 4]); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn store_m128i(r: &mut m128i, a: m128i) { unsafe { _mm_store_si128(&mut r.0, a.0) } } /// Stores the high lane value to the reference given. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([10.0, 12.0]); /// let mut f = 0.0; /// store_high_m128d_s(&mut f, a); /// assert_eq!(f, 12.0); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn store_high_m128d_s(r: &mut f64, a: m128d) { unsafe { _mm_storeh_pd(r as *mut f64, a.0) } } /// Stores the value to the reference given. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i64, 2]); /// let mut b = 0_i64; /// store_i64_m128i_s(&mut b, a); /// assert_eq!(b, 1_i64); /// ``` #[inline(always)] #[allow(clippy::cast_ptr_alignment)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn store_i64_m128i_s(r: &mut i64, a: m128i) { unsafe { _mm_storel_epi64(r as *mut i64 as *mut __m128i, a.0) } } /// Stores the value to the reference given. 
/// ``` /// # use safe_arch::*; /// let a = m128d::from_array([10.0, 12.0]); /// let mut b = zeroed_m128d(); /// store_reversed_m128d(&mut b, a); /// let c = b.to_array(); /// assert_eq!(c, [12.0, 10.0]); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn store_reversed_m128d(r: &mut m128d, a: m128d) { unsafe { _mm_storer_pd(r as *mut m128d as *mut f64, a.0) } } /// Stores the value to the reference given. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([10.0, 12.0]); /// let mut b = [0.0, 0.0]; /// store_unaligned_m128d(&mut b, a); /// assert_eq!(b, [10.0, 12.0]); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn store_unaligned_m128d(r: &mut [f64; 2], a: m128d) { unsafe { _mm_storeu_pd(r.as_mut_ptr(), a.0) } } /// Stores the value to the reference given. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let mut b = [0_u8; 16]; /// store_unaligned_m128i(&mut b, a); /// assert_eq!(b, [0_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// ``` #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn store_unaligned_m128i(r: &mut [u8; 16], a: m128i) { unsafe { _mm_storeu_si128(r.as_mut_ptr().cast(), a.0) } } /// Lanewise `a - b` with lanes as `i8`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b = m128i::from([0_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, -20, 21, 22, -23, 24, 127]); /// let c: [i8; 16] = sub_i8_m128i(a, b).into(); /// assert_eq!(c, [0, -10, 0, -10, 0, -10, 0, -10, 0, -10, 30, -10, -10, 36, -10, -112]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_sub_epi8(a.0, b.0) }) } /// Lanewise `a - b` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([51_i16, 61, 71, 81, -15, -26, -37, 48]); /// let c: [i16; 8] = sub_i16_m128i(a, b).into(); /// assert_eq!(c, [-50, -59, -68, -77, 14, 24, 34, -52]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_sub_epi16(a.0, b.0) }) } /// Lanewise `a - b` with lanes as `i32`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = m128i::from([50, 60, 70, 87]); /// let c: [i32; 4] = sub_i32_m128i(a, b).into(); /// assert_eq!(c, [-49, -58, -67, -83]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_sub_epi32(a.0, b.0) }) } /// Lanewise `a - b` with lanes as `i64`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([92_i64, 87]); /// let b = m128i::from([-9001_i64, 1]); /// let c: [i64; 2] = sub_i64_m128i(a, b).into(); /// assert_eq!(c, [9093, 86]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_i64_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_sub_epi64(a.0, b.0) }) } /// Lanewise `a - b`. 
/// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 87.5]); /// let b = m128d::from_array([100.0, -6.0]); /// let c = sub_m128d(a, b).to_array(); /// assert_eq!(c, [-8.0, 93.5]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_sub_pd(a.0, b.0) }) } /// Lowest lane `a - b`, high lane unchanged. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 87.5]); /// let b = m128d::from_array([100.0, -600.0]); /// let c = sub_m128d_s(a, b).to_array(); /// assert_eq!(c, [-8.0, 87.5]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_sub_sd(a.0, b.0) }) } /// Lanewise saturating `a - b` with lanes as `i8`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, -128, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -127]); /// let b = m128i::from([0_i8, 1, 2, 13, 4, 15, 6, 17, 8, 19, -20, 21, 22, -23, 24, 127]); /// let c: [i8; 16] = sub_saturating_i8_m128i(a, b).into(); /// assert_eq!(c, [0, -128, 0, -10, 0, -10, 0, -10, 0, -10, 30, -10, -10, 36, -10, -128]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_saturating_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_subs_epi8(a.0, b.0) }) } /// Lanewise saturating `a - b` with lanes as `i16`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([51_i16, 61, 71, 81, i16::MAX, -26, -37, 48]); /// let c: [i16; 8] = sub_saturating_i16_m128i(a, b).into(); /// assert_eq!(c, [-50, -59, -68, -77, -32768, 24, 34, -52]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_saturating_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_subs_epi16(a.0, b.0) }) } /// Lanewise saturating `a - b` with lanes as `u8`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([10_u8, 255, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 255]); /// let b = m128i::from([1_u8, 1, 2, 13, 4, 15, 6, 17, 8, 19, 20, 21, 22, 23, 24, 127]); /// let c: [u8; 16] = sub_saturating_u8_m128i(a, b).into(); /// assert_eq!(c, [9_u8, 254, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_saturating_u8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_subs_epu8(a.0, b.0) }) } /// Lanewise saturating `a - b` with lanes as `u16`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([51_u16, 61, 3, 4, u16::MAX, 2, 3, u16::MAX]); /// let b = m128i::from([5_u16, 2, 71, 81, u16::MAX, 26, 37, u16::MIN]); /// let c: [u16; 8] = sub_saturating_u16_m128i(a, b).into(); /// assert_eq!(c, [46, 59, 0, 0, 0, 0, 0, u16::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn sub_saturating_u16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_subs_epu16(a.0, b.0) }) } /// Unpack and interleave high `i8` lanes of `a` and `b`. 
/// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b = m128i::from([0_i8, 11, 2, 13, 4, 15, 6, 17, 8, 19, -20, 21, 22, -23, 24, 127]); /// let c: [i8; 16] = unpack_high_i8_m128i(a, b).into(); /// assert_eq!(c, [8, 8, 9, 19, 10, -20, 11, 21, 12, 22, 13, -23, 14, 24, 15, 127]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_high_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_unpackhi_epi8(a.0, b.0) }) } /// Unpack and interleave high `i16` lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([5_i16, 6, 7, 8, -15, -26, -37, 48]); /// let c: [i16; 8] = unpack_high_i16_m128i(a, b).into(); /// assert_eq!(c, [-1, -15, -2, -26, -3, -37, -4, 48]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_high_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_unpackhi_epi16(a.0, b.0) }) } /// Unpack and interleave high `i32` lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = m128i::from([5, 6, 7, 8]); /// let c: [i32; 4] = unpack_high_i32_m128i(a, b).into(); /// assert_eq!(c, [3, 7, 4, 8]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_high_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_unpackhi_epi32(a.0, b.0) }) } /// Unpack and interleave high `i64` lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([92_i64, 87]); /// let b = m128i::from([-9001_i64, 1]); /// let c: [i64; 2] = unpack_high_i64_m128i(a, b).into(); /// assert_eq!(c, [87, 1]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_high_i64_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_unpackhi_epi64(a.0, b.0) }) } /// Unpack and interleave high lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 87.5]); /// let b = m128d::from_array([100.0, -6.0]); /// let c = unpack_high_m128d(a, b).to_array(); /// assert_eq!(c, [87.5, -6.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_high_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_unpackhi_pd(a.0, b.0) }) } /// Unpack and interleave low `i8` lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b = m128i::from([12_i8, 11, 22, 13, 99, 15, 16, 17, 8, 19, -20, 21, 22, -23, 24, 127]); /// let c: [i8; 16] = unpack_low_i8_m128i(a, b).into(); /// assert_eq!(c, [0, 12, 1, 11, 2, 22, 3, 13, 4, 99, 5, 15, 6, 16, 7, 17]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_low_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_unpacklo_epi8(a.0, b.0) }) } /// Unpack and interleave low `i16` lanes of `a` and `b`. 
/// ``` /// # use safe_arch::*; /// let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); /// let b = m128i::from([5_i16, 6, 7, 8, -15, -26, -37, 48]); /// let c: [i16; 8] = unpack_low_i16_m128i(a, b).into(); /// assert_eq!(c, [1, 5, 2, 6, 3, 7, 4, 8]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_low_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_unpacklo_epi16(a.0, b.0) }) } /// Unpack and interleave low `i32` lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = m128i::from([5, 6, 7, 8]); /// let c: [i32; 4] = unpack_low_i32_m128i(a, b).into(); /// assert_eq!(c, [1, 5, 2, 6]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_low_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_unpacklo_epi32(a.0, b.0) }) } /// Unpack and interleave low `i64` lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([92_i64, 87]); /// let b = m128i::from([-9001_i64, 1]); /// let c: [i64; 2] = unpack_low_i64_m128i(a, b).into(); /// assert_eq!(c, [92, -9001]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_low_i64_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_unpacklo_epi64(a.0, b.0) }) } /// Unpack and interleave low lanes of `a` and `b`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([92.0, 87.5]); /// let b = m128d::from_array([100.0, -6.0]); /// let c = unpack_low_m128d(a, b).to_array(); /// assert_eq!(c, [92.0, 100.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn unpack_low_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_unpacklo_pd(a.0, b.0) }) } /// Bitwise `a ^ b`. /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 0.0]); /// let b = m128d::from_array([1.0, 1.0]); /// let c = bitxor_m128d(a, b).to_array(); /// assert_eq!(c, [0.0, 1.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn bitxor_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_xor_pd(a.0, b.0) }) } /// Bitwise `a ^ b`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 0, 1, 0]); /// let b = m128i::from([1, 1, 0, 0]); /// let c: [i32; 4] = bitxor_m128i(a, b).into(); /// assert_eq!(c, [0, 1, 1, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse2")))] pub fn bitxor_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_xor_si128(a.0, b.0) }) } // // Here we define the Operator Overloads for `m128`. Each one just calls the // correct function from above. By putting the impls here and not with the // `m128` type we theoretically would be able to build the crate safely even if // there's no `sse` feature enabled. You'd just have a `m128` type without the // operator overloads is all. Not that the standard Rust distribution can build // properly without `sse` enabled, but maybe you're using a custom target or // something. It doesn't really put us out of our way, so it doesn't hurt to try // and accommodate the potential use case. // // First we provide all `m128d` impls. 
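/// A short usage sketch of this `Add` overload (the values here are illustrative, not part of the API): /// /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([1.0, 2.0]); /// let b = m128d::from_array([3.0, 4.0]); /// // same as calling `add_m128d(a, b)` directly /// assert_eq!((a + b).to_array(), [4.0, 6.0]); /// ```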
impl Add for m128d { type Output = Self; #[must_use] #[inline(always)] fn add(self, rhs: Self) -> Self { add_m128d(self, rhs) } } impl AddAssign for m128d { #[inline(always)] fn add_assign(&mut self, rhs: Self) { *self = *self + rhs; } } impl BitAnd for m128d { type Output = Self; #[must_use] #[inline(always)] fn bitand(self, rhs: Self) -> Self { bitand_m128d(self, rhs) } } impl BitAndAssign for m128d { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = *self & rhs; } } impl BitOr for m128d { type Output = Self; #[must_use] #[inline(always)] fn bitor(self, rhs: Self) -> Self { bitor_m128d(self, rhs) } } impl BitOrAssign for m128d { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = *self | rhs; } } impl BitXor for m128d { type Output = Self; #[must_use] #[inline(always)] fn bitxor(self, rhs: Self) -> Self { bitxor_m128d(self, rhs) } } impl BitXorAssign for m128d { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = *self ^ rhs; } } impl Div for m128d { type Output = Self; #[must_use] #[inline(always)] fn div(self, rhs: Self) -> Self { div_m128d(self, rhs) } } impl DivAssign for m128d { #[inline(always)] fn div_assign(&mut self, rhs: Self) { *self = *self / rhs; } } impl Mul for m128d { type Output = Self; #[must_use] #[inline(always)] fn mul(self, rhs: Self) -> Self { mul_m128d(self, rhs) } } impl MulAssign for m128d { #[inline(always)] fn mul_assign(&mut self, rhs: Self) { *self = *self * rhs; } } impl Neg for m128d { type Output = Self; #[must_use] #[inline(always)] fn neg(self) -> Self { sub_m128d(zeroed_m128d(), self) } } impl Not for m128d { type Output = Self; /// Not a direct intrinsic, but it's very useful and the implementation is /// simple enough. /// /// Negates the bits by performing an `xor` with an all-1s bit pattern. #[must_use] #[inline(always)] fn not(self) -> Self { let all_bits = set_splat_m128d(f64::from_bits(u64::MAX)); self ^ all_bits } } impl Sub for m128d { type Output = Self; #[must_use] #[inline(always)] fn sub(self, rhs: Self) -> Self { sub_m128d(self, rhs) } } impl SubAssign for m128d { #[inline(always)] fn sub_assign(&mut self, rhs: Self) { *self = *self - rhs; } } impl PartialEq for m128d { /// Not a direct intrinsic, this is a `cmp_eq_mask` and then a `move_mask`. #[must_use] #[inline(always)] fn eq(&self, other: &Self) -> bool { move_mask_m128d(cmp_eq_mask_m128d(*self, *other)) == 0b11 } } // Next we provide all `m128i` impls. Since the interpretation of the lanes // depends on the operation used, we only provide the bit ops (which are "lane // agnostic"). impl BitAnd for m128i { type Output = Self; #[must_use] #[inline(always)] fn bitand(self, rhs: Self) -> Self { bitand_m128i(self, rhs) } } impl BitAndAssign for m128i { #[inline(always)] fn bitand_assign(&mut self, rhs: Self) { *self = *self & rhs; } } impl BitOr for m128i { type Output = Self; #[must_use] #[inline(always)] fn bitor(self, rhs: Self) -> Self { bitor_m128i(self, rhs) } } impl BitOrAssign for m128i { #[inline(always)] fn bitor_assign(&mut self, rhs: Self) { *self = *self | rhs; } } impl BitXor for m128i { type Output = Self; #[must_use] #[inline(always)] fn bitxor(self, rhs: Self) -> Self { bitxor_m128i(self, rhs) } } impl BitXorAssign for m128i { #[inline(always)] fn bitxor_assign(&mut self, rhs: Self) { *self = *self ^ rhs; } } impl Not for m128i { type Output = Self; /// Not a direct intrinsic, but it's very useful and the implementation is /// simple enough. /// /// Negates the bits by performing an `xor` with an all-1s bit pattern. 
#[must_use] #[inline(always)] fn not(self) -> Self { let all_bits = set_splat_i32_m128i(-1); self ^ all_bits } } impl PartialEq for m128i { /// Not a direct intrinsic, this is a `cmp_eq_mask_i8_m128i` and then a /// `move_mask_i8_m128i`. #[must_use] #[inline(always)] fn eq(&self, other: &Self) -> bool { move_mask_i8_m128i(cmp_eq_mask_i8_m128i(*self, *other)) == 0b11111111_11111111 } } /// Unlike with the floating types, ints have absolute equality. impl Eq for m128i {} safe_arch-0.7.1/src/x86_x64/sse3.rs000066400000000000000000000056571445526200400166050ustar00rootroot00000000000000#![cfg(target_feature = "sse3")] use super::*; /// Add the high lane and subtract the low lane. /// /// * **Intrinsic:** [`_mm_addsub_pd`] /// * **Assembly:** `addsubpd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse3")))] pub fn addsub_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_addsub_pd(a.0, b.0) }) } /// Alternately, from the top, add a lane and then subtract a lane. /// /// * **Intrinsic:** [`_mm_addsub_ps`] /// * **Assembly:** `addsubps xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse3")))] pub fn addsub_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_addsub_ps(a.0, b.0) }) } /// Add each lane horizontally, pack the outputs as `a` then `b`. /// /// * **Intrinsic:** [`_mm_hadd_pd`] /// * **Assembly:** `haddpd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse3")))] pub fn add_horizontal_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_hadd_pd(a.0, b.0) }) } /// Add each lane horizontally, pack the outputs as `a` then `b`. /// /// * **Intrinsic:** [`_mm_hadd_ps`] /// * **Assembly:** `haddps xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse3")))] pub fn add_horizontal_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_hadd_ps(a.0, b.0) }) } /// Subtract each lane horizontally, pack the outputs as `a` then `b`. /// /// * **Intrinsic:** [`_mm_hsub_pd`] /// * **Assembly:** `hsubpd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse3")))] pub fn sub_horizontal_m128d(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_hsub_pd(a.0, b.0) }) } /// Subtract each lane horizontally, pack the outputs as `a` then `b`. /// /// * **Intrinsic:** [`_mm_hsub_ps`] /// * **Assembly:** `hsubps xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse3")))] pub fn sub_horizontal_m128(a: m128, b: m128) -> m128 { m128(unsafe { _mm_hsub_ps(a.0, b.0) }) } /// Copy the low lane of the input to both lanes of the output. /// /// * **Intrinsic:** [`_mm_movedup_pd`] /// * **Assembly:** `movddup xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse3")))] pub fn duplicate_low_lane_m128d_s(a: m128d) -> m128d { m128d(unsafe { _mm_movedup_pd(a.0) }) } /// Duplicate the odd lanes to the even lanes. /// /// * **Intrinsic:** [`_mm_movehdup_ps`] /// * **Assembly:** `movshdup xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse3")))] pub fn duplicate_odd_lanes_m128(a: m128) -> m128 { m128(unsafe { _mm_movehdup_ps(a.0) }) } /// Duplicate the even lanes to the odd lanes. 
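/// /// A usage sketch (output values assume the standard `movsldup` even-lane duplication): /// /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let c = duplicate_even_lanes_m128(a).to_array(); /// assert_eq!(c, [1.0, 1.0, 3.0, 3.0]); /// ```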
/// /// * **Intrinsic:** [`_mm_moveldup_ps`] /// * **Assembly:** `movsldup xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse3")))] pub fn duplicate_even_lanes_m128(a: m128) -> m128 { m128(unsafe { _mm_moveldup_ps(a.0) }) } safe_arch-0.7.1/src/x86_x64/sse4_1.rs000066400000000000000000000720661445526200400170260ustar00rootroot00000000000000#![cfg(target_feature = "sse4.1")] use super::*; /// Blends the `i16` lanes according to the immediate mask. /// /// Each bit 0 through 7 controls lane 0 through 7. Use 0 for the `a` value and /// 1 for the `b` value. /// /// * **Intrinsic:** [`_mm_blend_epi16`] /// * **Assembly:** `pblendw xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn blend_imm_i16_m128i<const IMM: i32>(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_blend_epi16(a.0, b.0, IMM) }) } /// Blends the `f64` lanes according to the immediate mask. /// /// Bits 0 and 1 control where output lane 0 and 1 come from. Use 0 for the `a` /// value and 1 for the `b` value. /// /// * **Intrinsic:** [`_mm_blend_pd`] /// * **Assembly:** `blendpd xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn blend_imm_m128d<const IMM: i32>(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_blend_pd(a.0, b.0, IMM) }) } /// Blends the lanes according to the immediate mask. /// /// Bits 0 to 3 control where output lane 0 to 3 come from. Use 0 for the `a` /// value and 1 for the `b` value. /// /// * **Intrinsic:** [`_mm_blend_ps`] /// * **Assembly:** `blendps xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn blend_imm_m128<const IMM: i32>(a: m128, b: m128) -> m128 { m128(unsafe { _mm_blend_ps(a.0, b.0, IMM) }) } /// Blend the `i8` lanes according to a runtime varying mask. /// /// The sign bit of each `i8` lane in the `mask` value determines if the output /// lane uses `a` (mask non-negative) or `b` (mask negative). /// /// * **Intrinsic:** [`_mm_blendv_epi8`] /// * **Assembly:** `pblendvb xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn blend_varying_i8_m128i(a: m128i, b: m128i, mask: m128i) -> m128i { m128i(unsafe { _mm_blendv_epi8(a.0, b.0, mask.0) }) } /// Blend the lanes according to a runtime varying mask. /// /// The sign bit of each lane in the `mask` value determines if the output /// lane uses `a` (mask non-negative) or `b` (mask negative). /// /// * **Intrinsic:** [`_mm_blendv_pd`] /// * **Assembly:** `blendvpd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn blend_varying_m128d(a: m128d, b: m128d, mask: m128d) -> m128d { m128d(unsafe { _mm_blendv_pd(a.0, b.0, mask.0) }) } /// Blend the lanes according to a runtime varying mask. /// /// The sign bit of each lane in the `mask` value determines if the output /// lane uses `a` (mask non-negative) or `b` (mask negative). /// /// * **Intrinsic:** [`_mm_blendv_ps`] /// * **Assembly:** `blendvps xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn blend_varying_m128(a: m128, b: m128, mask: m128) -> m128 { m128(unsafe { _mm_blendv_ps(a.0, b.0, mask.0) }) } /// Round each lane to a whole number, towards positive infinity. /// /// * **Intrinsic:** [`_mm_ceil_pd`] /// * **Assembly:** `roundpd xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn ceil_m128d(a: m128d) -> m128d { m128d(unsafe { _mm_ceil_pd(a.0) }) } /// Round each lane to a whole number, towards positive infinity. 
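/// /// A usage sketch (assuming the usual round-toward-positive-infinity semantics): /// /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.1, 2.5, 3.8, 5.0]); /// assert_eq!(ceil_m128(a).to_array(), [2.0, 3.0, 4.0, 5.0]); /// ```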
/// /// * **Intrinsic:** [`_mm_ceil_ps`] /// * **Assembly:** `roundps xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn ceil_m128(a: m128) -> m128 { m128(unsafe { _mm_ceil_ps(a.0) }) } /// Round the low lane of `b` toward positive infinity, high lane is `a`. /// /// * **Intrinsic:** [`_mm_ceil_sd`] /// * **Assembly:** `roundsd xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn ceil_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_ceil_sd(a.0, b.0) }) } /// Round the low lane of `b` toward positive infinity, other lanes `a`. /// /// * **Intrinsic:** [`_mm_ceil_ss`] /// * **Assembly:** `roundss xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn ceil_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_ceil_ss(a.0, b.0) }) } /// Lanewise `a == b` with lanes as `i64`. /// /// All bits 1 for true (`-1`), all bits 0 for false (`0`). /// /// * **Intrinsic:** [`_mm_cmpeq_epi64`] /// * **Assembly:** `pcmpeqq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn cmp_eq_mask_i64_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmpeq_epi64(a.0, b.0) }) } /// Convert the lower four `i16` lanes to four `i32` lanes. /// /// * **Intrinsic:** [`_mm_cvtepi16_epi32`] /// * **Assembly:** `pmovsxwd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_i32_m128i_from_lower4_i16_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepi16_epi32(a.0) }) } /// Convert the lower two `i16` lanes to two `i64` lanes. /// /// * **Intrinsic:** [`_mm_cvtepi16_epi64`] /// * **Assembly:** `pmovsxwq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_i16_m128i_from_lower2_i16_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepi16_epi64(a.0) }) } /// Convert the lower two `i32` lanes to two `i64` lanes. /// /// * **Intrinsic:** [`_mm_cvtepi32_epi64`] /// * **Assembly:** `pmovsxdq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_i64_m128i_from_lower2_i32_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepi32_epi64(a.0) }) } /// Convert the lower eight `i8` lanes to eight `i16` lanes. /// /// * **Intrinsic:** [`_mm_cvtepi8_epi16`] /// * **Assembly:** `pmovsxbw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_i16_m128i_from_lower8_i8_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepi8_epi16(a.0) }) } /// Convert the lower four `i8` lanes to four `i32` lanes. /// /// * **Intrinsic:** [`_mm_cvtepi8_epi32`] /// * **Assembly:** `pmovsxbd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_i32_m128i_from_lower4_i8_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepi8_epi32(a.0) }) } /// Convert the lower two `i8` lanes to two `i64` lanes. /// /// * **Intrinsic:** [`_mm_cvtepi8_epi64`] /// * **Assembly:** `pmovsxbq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_i64_m128i_from_lower2_i8_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepi8_epi64(a.0) }) } /// Convert the lower four `u16` lanes to four `u32` lanes. 
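/// /// A usage sketch (zero-extension of the low four `u16` lanes): /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u16, 2, 3, u16::MAX, 5, 6, 7, 8]); /// let c: [u32; 4] = convert_to_u32_m128i_from_lower4_u16_m128i(a).into(); /// assert_eq!(c, [1, 2, 3, 65535]); /// ```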
/// /// * **Intrinsic:** [`_mm_cvtepu16_epi32`] /// * **Assembly:** `pmovzxwd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_u32_m128i_from_lower4_u16_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepu16_epi32(a.0) }) } /// Convert the lower two `u16` lanes to two `u64` lanes. /// /// * **Intrinsic:** [`_mm_cvtepu16_epi64`] /// * **Assembly:** `pmovzxwq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_u64_m128i_from_lower2_u16_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepu16_epi64(a.0) }) } /// Convert the lower two `u32` lanes to two `u64` lanes. /// /// * **Intrinsic:** [`_mm_cvtepu32_epi64`] /// * **Assembly:** `pmovzxdq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_u64_m128i_from_lower2_u32_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepu32_epi64(a.0) }) } /// Convert the lower eight `u8` lanes to eight `u16` lanes. /// /// * **Intrinsic:** [`_mm_cvtepu8_epi16`] /// * **Assembly:** `pmovzxbw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_u16_m128i_from_lower8_u8_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepu8_epi16(a.0) }) } /// Convert the lower four `u8` lanes to four `u32` lanes. /// /// * **Intrinsic:** [`_mm_cvtepu8_epi32`] /// * **Assembly:** `pmovzxbd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_u32_m128i_from_lower4_u8_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepu8_epi32(a.0) }) } /// Convert the lower two `u8` lanes to two `u64` lanes. /// /// * **Intrinsic:** [`_mm_cvtepu8_epi64`] /// * **Assembly:** `pmovzxbq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn convert_to_u64_m128i_from_lower2_u8_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_cvtepu8_epi64(a.0) }) } /// Performs a dot product of two `m128d` registers. /// /// The output details are determined by the constant: /// * For each lane, you can multiply that lane from `a` and `b` or you can take /// a default of 0.0 /// * Bits 4 and 5 determine if we mul lanes 0 in `a` and `b`, and lanes 1 in /// `a` and `b`. /// * This forms two temporary `f64` values which are summed to a single `f64`. /// * For each output lane, you can have the sum in that lane or 0.0. /// * Bits 0 and 1 determine if an output lane is our sum or 0.0. /// /// * **Intrinsic:** [`_mm_dp_pd`] /// * **Assembly:** `dppd xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn dot_product_m128d<const IMM: i32>(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_dp_pd(a.0, b.0, IMM) }) } /// Performs a dot product of two `m128` registers. /// /// The output details are determined by a control mask: /// * For each lane, you can multiply that lane from `a` and `b` or you can take /// a default of 0.0 /// * Bits 4 through 7 determine if we should mul lanes 0 through 3. /// * This forms four temporary `f32` values which are summed to a single `f32`. /// * For each output lane, you can have the sum in that lane or 0.0. /// * Bits 0 through 3 determine if the `sum` is in lanes 0 through 3 (see the sketch below). 
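/// /// A usage sketch (assuming `IMM = 0b1111_0001`: multiply all four lane pairs, place the sum in lane 0 only): /// /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.0]); /// let c = dot_product_m128::<0b1111_0001>(a, b).to_array(); /// assert_eq!(c, [70.0, 0.0, 0.0, 0.0]); /// ```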
/// /// * **Intrinsic:** [`_mm_dp_ps`] /// * **Assembly:** `dpps xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn dot_product_m128<const IMM: i32>(a: m128, b: m128) -> m128 { m128(unsafe { _mm_dp_ps(a.0, b.0, IMM) }) } /// Gets the `i32` lane requested. Only the lowest 2 bits are considered. /// /// * **Intrinsic:** [`_mm_extract_epi32`] /// * **Assembly:** `pextrd r32, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn extract_i32_imm_m128i<const IMM: i32>(a: m128i) -> i32 { unsafe { _mm_extract_epi32(a.0, IMM) } } /// Gets the `i64` lane requested. Only the lowest bit is considered. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([5_i64, 6]); /// assert_eq!(extract_i64_imm_m128i::<1>(a), 6_i64); /// ``` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn extract_i64_imm_m128i<const IMM: i32>(a: m128i) -> i64 { unsafe { _mm_extract_epi64(a.0, IMM) } } /// Gets the `i8` lane requested. Only the lowest 4 bits are considered. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 101, 8, 9, 10, 11, 12, 13, 14, 15]); /// assert_eq!(extract_i8_as_i32_imm_m128i::<7>(a), 101_i32); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn extract_i8_as_i32_imm_m128i<const IMM: i32>(a: m128i) -> i32 { unsafe { _mm_extract_epi8(a.0, IMM) } } /// Gets the `f32` lane requested. Returns as an `i32` bit pattern. /// /// ``` /// # use safe_arch::*; /// let a = m128::from_array([5.0, 6.0, 7.0, 8.0]); /// assert_eq!(extract_f32_as_i32_bits_imm_m128::<3>(a), 8_f32.to_bits() as i32); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn extract_f32_as_i32_bits_imm_m128<const IMM: i32>(a: m128) -> i32 { unsafe { _mm_extract_ps(a.0, IMM) } } /// Round each lane to a whole number, towards negative infinity /// /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([-0.1, 1.8]); /// assert_eq!(floor_m128d(a).to_array(), [-1.0, 1.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn floor_m128d(a: m128d) -> m128d { m128d(unsafe { _mm_floor_pd(a.0) }) } /// Round each lane to a whole number, towards negative infinity /// /// ``` /// # use safe_arch::*; /// let a = m128::from_array([-0.1, 1.8, 2.5, 3.0]); /// assert_eq!(floor_m128(a).to_array(), [-1.0, 1.0, 2.0, 3.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn floor_m128(a: m128) -> m128 { m128(unsafe { _mm_floor_ps(a.0) }) } /// Round the low lane of `b` toward negative infinity, high lane is `a`. /// /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([-0.1, 1.8]); /// let b = m128d::from_array([2.5, 3.0]); /// assert_eq!(floor_m128d_s(a, b).to_array(), [2.0, 1.8]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn floor_m128d_s(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_floor_sd(a.0, b.0) }) } /// Round the low lane of `b` toward negative infinity, other lanes `a`. 
/// /// ``` /// # use safe_arch::*; /// let a = m128::from_array([-0.1, 1.8, 5.0, 6.0]); /// let b = m128::from_array([2.5, 3.0, 10.0, 20.0]); /// assert_eq!(floor_m128_s(a, b).to_array(), [2.0, 1.8, 5.0, 6.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn floor_m128_s(a: m128, b: m128) -> m128 { m128(unsafe { _mm_floor_ss(a.0, b.0) }) } /// Inserts a new value for the `i32` lane specified. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([5, 6, 7, 8]); /// let b: [i32; 4] = insert_i32_imm_m128i::<1>(a, 23).into(); /// assert_eq!(b, [5, 23, 7, 8]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn insert_i32_imm_m128i<const IMM: i32>(a: m128i, new: i32) -> m128i { m128i(unsafe { _mm_insert_epi32(a.0, new, IMM) }) } /// Inserts a new value for the `i64` lane specified. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([5_i64, 6]); /// let b: [i64; 2] = insert_i64_imm_m128i::<1>(a, 23).into(); /// assert_eq!(b, [5_i64, 23]); /// ``` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn insert_i64_imm_m128i<const IMM: i32>(a: m128i, new: i64) -> m128i { m128i(unsafe { _mm_insert_epi64(a.0, new, IMM) }) } /// Inserts a new value for the `i8` lane specified. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// let b: [i8; 16] = insert_i8_imm_m128i::<1>(a, 23).into(); /// assert_eq!(b, [0_i8, 23, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn insert_i8_imm_m128i<const IMM: i32>(a: m128i, new: i32) -> m128i { m128i(unsafe { _mm_insert_epi8(a.0, new, IMM) }) } /// Inserts a lane from `b` into `a`, optionally at a new position. /// /// Also, you can zero out any lanes you like for free as part of the same /// operation: the low four bits of the immediate are a zeroing mask (all 0 keeps every lane). /// /// ``` /// # use safe_arch::*; /// let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); /// let b = m128::from_array([5.0, 6.0, 7.0, 8.0]); /// // /// let c = insert_f32_imm_m128::<0b00_11_0000>(a, b).to_array(); /// assert_eq!(c, [1.0, 2.0, 3.0, 5.0]); /// // /// let c = insert_f32_imm_m128::<0b00_11_0110>(a, b).to_array(); /// assert_eq!(c, [1.0, 0.0, 0.0, 5.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn insert_f32_imm_m128<const IMM: i32>(a: m128, b: m128) -> m128 { m128(unsafe { _mm_insert_ps(a.0, b.0, IMM) }) } /// Lanewise `max(a, b)` with lanes as `i32`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = m128i::from([5, 6, -7, 8]); /// let c: [i32; 4] = max_i32_m128i(a, b).into(); /// assert_eq!(c, [5, 6, 3, 8]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn max_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_max_epi32(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `i8`. 
/// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]); /// let b = m128i::from([0_i8, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, 24, 127]); /// let c: [i8; 16] = max_i8_m128i(a, b).into(); /// assert_eq!(c, [0, 11, 2, 3, 4, 15, 6, 7, 8, 19, 10, 21, 22, 13, 24, 127]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn max_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_max_epi8(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `u16`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u16, 2, 300, 400, 1, 2, 3, 4]); /// let b = m128i::from([5_u16, 6, 7, 8, 15, 26, 37, 48]); /// let c: [u16; 8] = max_u16_m128i(a, b).into(); /// assert_eq!(c, [5_u16, 6, 300, 400, 15, 26, 37, 48]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn max_u16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_max_epu16(a.0, b.0) }) } /// Lanewise `max(a, b)` with lanes as `u32`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 200, 3, 4]); /// let b = m128i::from([5, 6, 7, 8]); /// let c: [u32; 4] = max_u32_m128i(a, b).into(); /// assert_eq!(c, [5, 200, 7, 8]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn max_u32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_max_epu32(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `i32`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = m128i::from([5, 6, -7, 8]); /// let c: [i32; 4] = min_i32_m128i(a, b).into(); /// assert_eq!(c, [1, 2, -7, 4]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn min_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_min_epi32(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `i8`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 127]); /// let b = m128i::from([0_i8, 11, 2, -13, 4, 15, 6, -17, -8, 19, -20, 21, 22, -23, 24, 127]); /// let c: [i8; 16] = min_i8_m128i(a, b).into(); /// assert_eq!(c, [0_i8, 1, 2, -13, 4, 5, 6, -17, -8, 9, -20, 11, 12, -23, 14, 127]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn min_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_min_epi8(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `u16`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1_u16, 2, 300, 400, 1, 2, 3, 4]); /// let b = m128i::from([5_u16, 6, 7, 8, 15, 26, 37, 48]); /// let c: [u16; 8] = min_u16_m128i(a, b).into(); /// assert_eq!(c, [1_u16, 2, 7, 8, 1, 2, 3, 4]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn min_u16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_min_epu16(a.0, b.0) }) } /// Lanewise `min(a, b)` with lanes as `u32`. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 200, 3, 4]); /// let b = m128i::from([5, 6, 7, 8]); /// let c: [u32; 4] = min_u32_m128i(a, b).into(); /// assert_eq!(c, [1, 6, 3, 4]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn min_u32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_min_epu32(a.0, b.0) }) } /// Min `u16` value, position, and other lanes zeroed. 
/// /// ``` /// # use safe_arch::*; /// let a = m128i::from([120_u16, 24, 300, 400, 90, 129, 31, 114]); /// let c: [u16; 8] = min_position_u16_m128i(a).into(); /// assert_eq!(c, [24_u16, 1, 0, 0, 0, 0, 0, 0]); /// /// // the position favors the leftmost minimum /// let a = m128i::from([120_u16, 24, 24, 400, 90, 129, 31, 114]); /// let c: [u16; 8] = min_position_u16_m128i(a).into(); /// assert_eq!(c, [24_u16, 1, 0, 0, 0, 0, 0, 0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn min_position_u16_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_minpos_epu16(a.0) }) } /// Computes eight `u16` "sum of absolute difference" values according to the /// bytes selected. /// /// * `a` can be 0 or 1, and specifies whether to skip the first four `a` values or not. /// * `b` can be 0, 1, 2, or 3 and specifies to skip the first four times that /// many values in `b`. /// /// This is mostly useful for the "sum of absolute differences" sliding-window /// searches that video encoders use for motion estimation. If you can write better docs about what /// this does please file a PR. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_u8, 1, 56, 3, 255, 5, 127, 7, 128, 9, 100, 101, 123, 13, 154, 125]); /// let b = m128i::from([12_u8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); /// // /// let c: [u16; 8] = multi_packed_sum_abs_diff_u8_m128i::<0b00_00>(a, b).into(); /// assert_eq!(c, [66, 319, 301, 390, 376, 263, 253, 236]); /// // /// let c: [u16; 8] = multi_packed_sum_abs_diff_u8_m128i::<0b00_01>(a, b).into(); /// assert_eq!(c, [62, 305, 305, 372, 372, 245, 249, 222]); /// // /// let c: [u16; 8] = multi_packed_sum_abs_diff_u8_m128i::<0b00_10>(a, b).into(); /// assert_eq!(c, [70, 305, 305, 372, 372, 241, 241, 210]); /// // /// let c: [u16; 8] = multi_packed_sum_abs_diff_u8_m128i::<0b00_11>(a, b).into(); /// assert_eq!(c, [78, 305, 305, 372, 372, 241, 241, 210]); /// // /// let c: [u16; 8] = multi_packed_sum_abs_diff_u8_m128i::<0b01_00>(a, b).into(); /// assert_eq!(c, [376, 263, 253, 236, 320, 321, 319, 373]); /// // /// let c: [u16; 8] = multi_packed_sum_abs_diff_u8_m128i::<0b01_01>(a, b).into(); /// assert_eq!(c, [372, 245, 249, 222, 316, 311, 315, 369]); /// // /// let c: [u16; 8] = multi_packed_sum_abs_diff_u8_m128i::<0b01_10>(a, b).into(); /// assert_eq!(c, [372, 241, 241, 210, 300, 295, 299, 353]); /// // /// let c: [u16; 8] = multi_packed_sum_abs_diff_u8_m128i::<0b01_11>(a, b).into(); /// assert_eq!(c, [372, 241, 241, 210, 292, 285, 287, 339]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn multi_packed_sum_abs_diff_u8_m128i<const IMM: i32>(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_mpsadbw_epu8(a.0, b.0, IMM) }) } /// Multiplies the odd `i32` lanes and gives the widened (`i64`) results. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 7, i32::MAX, 7]); /// let b = m128i::from([-5, 7, i32::MAX, 7]); /// let c: [i64; 2] = mul_widen_i32_odd_m128i(a, b).into(); /// assert_eq!(c, [(-1 * 5), (i32::MAX as i64 * i32::MAX as i64)]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn mul_widen_i32_odd_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_mul_epi32(a.0, b.0) }) } /// Lanewise `a * b` with 32-bit lanes. /// /// This keeps the low 32-bits from each 64-bit output, /// so it actually works for both `i32` and `u32`. 
/// ``` /// # use safe_arch::*; /// let ai = m128i::from([1, 2000000, -300, 45689]); /// let bi = m128i::from([5, 6000000, 700, -89109]); /// let ci: [i32; 4] = mul_32_m128i(ai, bi).into(); /// assert_eq!(ci, [5, -138625024, -210000, 223666195]); /// /// let au = m128i::from([u32::MAX, 26, 5678, 1234567890]); /// let bu = m128i::from([u32::MAX, 74, 9101112, 765]); /// let cu: [u32; 4] = mul_32_m128i(au, bu).into(); /// assert_eq!(cu, [1, 1924, 136506384, 3846598026]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn mul_32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_mullo_epi32(a.0, b.0) }) } /// Saturating convert `i32` to `u16`, and pack the values. /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, 2, 3, 4]); /// let b = m128i::from([9, -10, -11, i32::MAX]); /// let c: [u16; 8] = pack_i32_to_u16_m128i(a, b).into(); /// assert_eq!(c, [1, 2, 3, 4, 9, 0, 0, u16::MAX]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn pack_i32_to_u16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_packus_epi32(a.0, b.0) }) } /// Rounds each lane in the style specified. /// /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([-0.1, 1.6]); /// // /// assert_eq!(round_m128d::<{ round_op!(Nearest) }>(a).to_array(), [0.0, 2.0]); /// // /// assert_eq!(round_m128d::<{ round_op!(NegInf) }>(a).to_array(), [-1.0, 1.0]); /// // /// assert_eq!(round_m128d::<{ round_op!(PosInf) }>(a).to_array(), [0.0, 2.0]); /// // /// assert_eq!(round_m128d::<{ round_op!(Zero) }>(a).to_array(), [0.0, 1.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn round_m128d<const MODE: i32>(a: m128d) -> m128d { m128d(unsafe { _mm_round_pd(a.0, MODE) }) } /// Rounds `b` low as specified, keeps `a` high. /// /// ``` /// # use safe_arch::*; /// let a = m128d::from_array([f64::NAN, 900.0]); /// // /// let b = m128d::from_array([-0.1, f64::NAN]); /// // /// assert_eq!(round_m128d_s::<{ round_op!(Nearest) }>(a, b).to_array(), [0.0, 900.0]); /// assert_eq!(round_m128d_s::<{ round_op!(NegInf) }>(a, b).to_array(), [-1.0, 900.0]); /// // /// let b = m128d::from_array([2.4, f64::NAN]); /// // /// assert_eq!(round_m128d_s::<{ round_op!(PosInf) }>(a, b).to_array(), [3.0, 900.0]); /// assert_eq!(round_m128d_s::<{ round_op!(Zero) }>(a, b).to_array(), [2.0, 900.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn round_m128d_s<const MODE: i32>(a: m128d, b: m128d) -> m128d { m128d(unsafe { _mm_round_sd(a.0, b.0, MODE) }) } /// Rounds each lane in the style specified. /// /// ``` /// # use safe_arch::*; /// let a = m128::from_array([-0.1, 1.6, 3.3, 4.5]); /// // /// assert_eq!(round_m128::<{ round_op!(Nearest) }>(a).to_array(), [0.0, 2.0, 3.0, 4.0]); /// // /// assert_eq!(round_m128::<{ round_op!(NegInf) }>(a).to_array(), [-1.0, 1.0, 3.0, 4.0]); /// // /// assert_eq!(round_m128::<{ round_op!(PosInf) }>(a).to_array(), [0.0, 2.0, 4.0, 5.0]); /// // /// assert_eq!(round_m128::<{ round_op!(Zero) }>(a).to_array(), [0.0, 1.0, 3.0, 4.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn round_m128<const MODE: i32>(a: m128) -> m128 { m128(unsafe { _mm_round_ps(a.0, MODE) }) } /// Rounds `b` low as specified, other lanes use `a`. 
/// /// ``` /// # use safe_arch::*; /// let a = m128::from_array([f32::NAN, 6.0, 7.0, 8.0]); /// // /// let b = m128::from_array([-0.1, f32::NAN, f32::NAN, f32::NAN]); /// // /// assert_eq!(round_m128_s::<{ round_op!(Nearest) }>(a, b).to_array(), [0.0, 6.0, 7.0, 8.0]); /// assert_eq!(round_m128_s::<{ round_op!(NegInf) }>(a, b).to_array(), [-1.0, 6.0, 7.0, 8.0]); /// // /// let b = m128::from_array([2.4, f32::NAN, f32::NAN, f32::NAN]); /// // /// assert_eq!(round_m128_s::<{ round_op!(PosInf) }>(a, b).to_array(), [3.0, 6.0, 7.0, 8.0]); /// assert_eq!(round_m128_s::<{ round_op!(Zero) }>(a, b).to_array(), [2.0, 6.0, 7.0, 8.0]); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn round_m128_s<const MODE: i32>(a: m128, b: m128) -> m128 { m128(unsafe { _mm_round_ss(a.0, b.0, MODE) }) } /// Tests if all bits are 1. /// /// ``` /// # use safe_arch::*; /// let a = m128i::from(0_u128); /// let b = m128i::from(u128::MAX); /// assert_eq!(test_all_ones_m128i(a), 0); /// assert_eq!(test_all_ones_m128i(b), 1); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn test_all_ones_m128i(a: m128i) -> i32 { unsafe { _mm_test_all_ones(a.0) } } /// Returns if all masked bits are 0, `(a & mask) as u128 == 0` /// /// ``` /// # use safe_arch::*; /// let a = m128i::from(0b111_u128); /// let mask = m128i::from(u128::MAX); /// assert_eq!(test_all_zeroes_m128i(a, mask), 0); /// // /// let a = m128i::from(0b0_u128); /// let mask = m128i::from(u128::MAX); /// assert_eq!(test_all_zeroes_m128i(a, mask), 1); /// // /// let a = m128i::from(0b1_0000_u128); /// let mask = m128i::from(0b0_1111_u128); /// assert_eq!(test_all_zeroes_m128i(a, mask), 1); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn test_all_zeroes_m128i(a: m128i, mask: m128i) -> i32 { unsafe { _mm_test_all_zeros(a.0, mask.0) } } /// Returns if, among the masked bits, there's both 0s and 1s /// /// * Zero Flag = `(a & mask) as u128 == 0` /// * Carry Flag = `((!a) & mask) as u128 == 0` /// * Return `ZeroFlag == 0 && Carry Flag == 0` /// /// ``` /// # use safe_arch::*; /// let a = m128i::from(0b111_u128); /// let mask = m128i::from(u128::MAX); /// assert_eq!(test_mixed_ones_and_zeroes_m128i(a, mask), 1); /// // /// let a = m128i::from(0b0_u128); /// let mask = m128i::from(u128::MAX); /// assert_eq!(test_mixed_ones_and_zeroes_m128i(a, mask), 0); /// // /// let a = m128i::from(0b1_0000_u128); /// let mask = m128i::from(0b0_1111_u128); /// assert_eq!(test_mixed_ones_and_zeroes_m128i(a, mask), 0); /// // /// let a = m128i::from(0b1_0000_u128); /// let mask = m128i::from(0b1_1111_u128); /// assert_eq!(test_mixed_ones_and_zeroes_m128i(a, mask), 1); /// ``` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.1")))] pub fn test_mixed_ones_and_zeroes_m128i(a: m128i, mask: m128i) -> i32 { unsafe { _mm_test_mix_ones_zeros(a.0, mask.0) } } safe_arch-0.7.1/src/x86_x64/sse4_2.rs000066400000000000000000000153551445526200400170270ustar00rootroot00000000000000#![cfg(target_feature = "sse4.2")] use super::*; /// Lanewise `a > b` with lanes as `i64`. /// /// All bits 1 for true (`-1`), all bits 0 for false (`0`). 
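/// /// A usage sketch (signed 64-bit greater-than comparison): /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([5_i64, 2]); /// let b = m128i::from([1_i64, 3]); /// let c: [i64; 2] = cmp_gt_mask_i64_m128i(a, b).into(); /// assert_eq!(c, [-1, 0]); /// ```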
/// /// * **Intrinsic:** [`_mm_cmpgt_epi64`] /// * **Assembly:** `pcmpgtq xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.2")))] pub fn cmp_gt_mask_i64_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_cmpgt_epi64(a.0, b.0) }) } /// Accumulates the `u8` into a running CRC32 value. /// /// * **Intrinsic:** [`_mm_crc32_u8`] /// * **Assembly:** `crc32 r32, r8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.2")))] pub fn crc32_u8(crc: u32, v: u8) -> u32 { unsafe { _mm_crc32_u8(crc, v) } } /// Accumulates the `u16` into a running CRC32 value. /// /// * **Intrinsic:** [`_mm_crc32_u16`] /// * **Assembly:** `crc32 r32, r16` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.2")))] pub fn crc32_u16(crc: u32, v: u16) -> u32 { unsafe { _mm_crc32_u16(crc, v) } } /// Accumulates the `u32` into a running CRC32 value. /// /// * **Intrinsic:** [`_mm_crc32_u32`] /// * **Assembly:** `crc32 r32, r32` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.2")))] pub fn crc32_u32(crc: u32, v: u32) -> u32 { unsafe { _mm_crc32_u32(crc, v) } } /// Accumulates the `u64` into a running CRC32 value. /// /// **Note:** Has a different return type from the other crc32 functions. /// /// * **Intrinsic:** [`_mm_crc32_u64`] /// * **Assembly:** `crc32 r64, r64` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.2")))] pub fn crc32_u64(crc: u64, v: u64) -> u64 { unsafe { _mm_crc32_u64(crc, v) } } /// string segment elements are u8 values pub const STR_CMP_U8: i32 = _SIDD_UBYTE_OPS; /// string segment elements are u16 values pub const STR_CMP_U16: i32 = _SIDD_UWORD_OPS; /// string segment elements are i8 values pub const STR_CMP_I8: i32 = _SIDD_SBYTE_OPS; /// string segment elements are i16 values pub const STR_CMP_I16: i32 = _SIDD_SWORD_OPS; /// Matches when _any_ haystack character equals _any_ needle character, /// regardless of position. pub const STR_CMP_EQ_ANY: i32 = _SIDD_CMP_EQUAL_ANY; /// Interprets consecutive pairs of characters in the needle as `(low..=high)` /// ranges to compare each haystack character to. pub const STR_CMP_RANGES: i32 = _SIDD_CMP_RANGES; /// Matches when a character position in the needle is equal to the character at /// the same position in the haystack. pub const STR_CMP_EQ_EACH: i32 = _SIDD_CMP_EQUAL_EACH; /// Matches when the complete needle string is a substring somewhere in the /// haystack. pub const STR_CMP_EQ_ORDERED: i32 = _SIDD_CMP_EQUAL_ORDERED; /// Return the index of the first match found. pub const STR_CMP_FIRST_MATCH: i32 = _SIDD_LEAST_SIGNIFICANT; /// Return the index of the last match found. pub const STR_CMP_LAST_MATCH: i32 = _SIDD_MOST_SIGNIFICANT; /// Return the bitwise mask of matches. pub const STR_CMP_BIT_MASK: i32 = _SIDD_BIT_MASK; /// Return the lanewise mask of matches. pub const STR_CMP_UNIT_MASK: i32 = _SIDD_UNIT_MASK; /// Search for `needle` in `haystack`, with implicit string length. /// /// In the constant you need to provide (combine with `|`): /// * A comparison unit: `STR_CMP_U8`, `STR_CMP_U16`, `STR_CMP_I8`, or /// `STR_CMP_I16`. /// * A comparison op: `STR_CMP_EQ_ANY`, `STR_CMP_RANGES`, `STR_CMP_EQ_EACH`, or /// `STR_CMP_EQ_ORDERED`. /// * The desired output: `STR_CMP_FIRST_MATCH` or `STR_CMP_LAST_MATCH`. /// /// The first 0 unit is a null terminator for the string. If the string has no 0 /// units then the string ends at the end of the register. 
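/// /// A usage sketch with hypothetical byte strings (this assumes `STR_CMP_EQ_ANY` reports the first haystack position holding any needle byte): /// /// ``` /// # use safe_arch::*; /// let needle = m128i::from(*b"ab\0\0\0\0\0\0\0\0\0\0\0\0\0\0"); /// let haystack = m128i::from(*b"xxxyyyabzzz\0\0\0\0\0"); /// let i = search_implicit_str_for_index::<{ STR_CMP_U8 | STR_CMP_EQ_ANY | STR_CMP_FIRST_MATCH }>(needle, haystack); /// assert_eq!(i, 6); /// ```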
/// /// If there's no match the output is the total number of units the register holds (16 with 8-bit units, 8 with 16-bit units). /// /// * **Intrinsic:** [`_mm_cmpistri`] /// * **Assembly:** `pcmpistri xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.2")))] pub fn search_implicit_str_for_index<const IMM: i32>(needle: m128i, haystack: m128i) -> i32 { unsafe { _mm_cmpistri(needle.0, haystack.0, IMM) } } /// Search for `needle` in `haystack`, with explicit string length. /// /// In the constant you need to provide (combine with `|`): /// * A comparison unit: `STR_CMP_U8`, `STR_CMP_U16`, `STR_CMP_I8`, or /// `STR_CMP_I16`. /// * A comparison op: `STR_CMP_EQ_ANY`, `STR_CMP_RANGES`, `STR_CMP_EQ_EACH`, or /// `STR_CMP_EQ_ORDERED`. /// * The desired output: `STR_CMP_FIRST_MATCH` or `STR_CMP_LAST_MATCH`. /// /// If there's no match the output is the total number of units the register holds (16 with 8-bit units, 8 with 16-bit units). /// /// * **Intrinsic:** [`_mm_cmpestri`] /// * **Assembly:** `pcmpestri xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.2")))] pub fn search_explicit_str_for_index<const IMM: i32>( needle: m128i, needle_len: i32, haystack: m128i, haystack_len: i32, ) -> i32 { unsafe { _mm_cmpestri(needle.0, needle_len, haystack.0, haystack_len, IMM) } } /// Search for `needle` in `haystack`, with implicit string length. /// /// In the constant you need to provide (combine with `|`): /// * A comparison unit: `STR_CMP_U8`, `STR_CMP_U16`, `STR_CMP_I8`, or /// `STR_CMP_I16`. /// * A comparison op: `STR_CMP_EQ_ANY`, `STR_CMP_RANGES`, `STR_CMP_EQ_EACH`, or /// `STR_CMP_EQ_ORDERED`. /// * The desired out mask style: `STR_CMP_BIT_MASK` or `STR_CMP_UNIT_MASK`. /// /// The first 0 unit is a null terminator for the string. If the string has no 0 /// units then the string ends at the end of the register. /// /// * **Intrinsic:** [`_mm_cmpistrm`] /// * **Assembly:** `pcmpistrm xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.2")))] pub fn search_implicit_str_for_mask<const IMM: i32>(needle: m128i, haystack: m128i) -> m128i { m128i(unsafe { _mm_cmpistrm(needle.0, haystack.0, IMM) }) } /// Search for `needle` in `haystack`, with explicit string length. /// /// In the constant you need to provide (combine with `|`): /// * A comparison unit: `STR_CMP_U8`, `STR_CMP_U16`, `STR_CMP_I8`, or /// `STR_CMP_I16`. /// * A comparison op: `STR_CMP_EQ_ANY`, `STR_CMP_RANGES`, `STR_CMP_EQ_EACH`, or /// `STR_CMP_EQ_ORDERED`. /// * The desired out mask style: `STR_CMP_BIT_MASK` or `STR_CMP_UNIT_MASK`. /// /// If there's no match the output mask is all zero. /// /// * **Intrinsic:** [`_mm_cmpestrm`] /// * **Assembly:** `pcmpestrm xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg(target_arch = "x86_64")] #[cfg_attr(docs_rs, doc(cfg(target_feature = "sse4.2")))] pub fn search_explicit_str_for_mask<const IMM: i32>( needle: m128i, needle_len: i32, haystack: m128i, haystack_len: i32, ) -> m128i { m128i(unsafe { _mm_cmpestrm(needle.0, needle_len, haystack.0, haystack_len, IMM) }) } safe_arch-0.7.1/src/x86_x64/ssse3.rs000066400000000000000000000151151445526200400167620ustar00rootroot00000000000000#![cfg(target_feature = "ssse3")] use super::*; /// Lanewise absolute value with lanes as `i8`. /// /// This is a "wrapping" absolute value, so `i8::MIN` stays as `i8::MIN`. 
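/// /// A usage sketch (note the wrapping behavior on `i8::MIN`): /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([0_i8, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, i8::MAX, i8::MIN]); /// let c: [i8; 16] = abs_i8_m128i(a).into(); /// assert_eq!(c, [0_i8, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, i8::MAX, i8::MIN]); /// ```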
/// /// * **Intrinsic:** [`_mm_abs_epi8`] /// * **Assembly:** `pabsb xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn abs_i8_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_abs_epi8(a.0) }) } /// Lanewise absolute value with lanes as `i16`. /// /// This is a "wrapping" absolute value, so `i16::MIN` stays as `i16::MIN`. /// /// * **Intrinsic:** [`_mm_abs_epi16`] /// * **Assembly:** `pabsw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn abs_i16_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_abs_epi16(a.0) }) } /// Lanewise absolute value with lanes as `i32`. /// /// This is a "wrapping" absolute value, so `i32::MIN` stays as `i32::MIN`. /// /// * **Intrinsic:** [`_mm_abs_epi32`] /// * **Assembly:** `pabsd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn abs_i32_m128i(a: m128i) -> m128i { m128i(unsafe { _mm_abs_epi32(a.0) }) } /// Counts `a` as the high bytes and `b` as the low bytes then performs a /// **byte** shift to the right by the immediate value. /// /// Remember that this is all little-endian data. /// /// * **Intrinsic:** [`_mm_alignr_epi8`] /// * **Assembly:** `palignr xmm, xmm, imm8` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn combined_byte_shr_imm_m128i<const IMM: i32>(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_alignr_epi8(a.0, b.0, IMM) }) } /// Add horizontal pairs of `i16` values, pack the outputs as `a` then `b`. /// /// * **Intrinsic:** [`_mm_hadd_epi16`] /// * **Assembly:** `phaddw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn add_horizontal_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_hadd_epi16(a.0, b.0) }) } /// Add horizontal pairs of `i32` values, pack the outputs as `a` then `b`. /// /// * **Intrinsic:** [`_mm_hadd_epi32`] /// * **Assembly:** `phaddd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn add_horizontal_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_hadd_epi32(a.0, b.0) }) } /// Add horizontal pairs of `i16` values, saturating, pack the outputs as `a` /// then `b`. /// /// * **Intrinsic:** [`_mm_hadds_epi16`] /// * **Assembly:** `phaddsw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn add_horizontal_saturating_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_hadds_epi16(a.0, b.0) }) } /// Subtract horizontal pairs of `i16` values, pack the outputs as `a` then `b`. /// /// * **Intrinsic:** [`_mm_hsub_epi16`] /// * **Assembly:** `phsubw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn sub_horizontal_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_hsub_epi16(a.0, b.0) }) } /// Subtract horizontal pairs of `i32` values, pack the outputs as `a` then `b`. /// /// * **Intrinsic:** [`_mm_hsub_epi32`] /// * **Assembly:** `phsubd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn sub_horizontal_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_hsub_epi32(a.0, b.0) }) } /// Subtract horizontal pairs of `i16` values, saturating, pack the outputs as /// `a` then `b`. 
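/// /// A usage sketch (each output lane is `even_lane - odd_lane` of the source, with signed saturation): /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([i16::MIN, 1, 7, 2, 10, 3, 100, 20]); /// let b = m128i::from([1_i16, 2, 3, 4, 5, 6, 7, 8]); /// let c: [i16; 8] = sub_horizontal_saturating_i16_m128i(a, b).into(); /// assert_eq!(c, [i16::MIN, 5, 7, 80, -1, -1, -1, -1]); /// ```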
/// /// * **Intrinsic:** [`_mm_hsubs_epi16`] /// * **Assembly:** `phsubsw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn sub_horizontal_saturating_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_hsubs_epi16(a.0, b.0) }) } /// This is dumb and weird. /// /// * Vertically multiplies each `u8` lane from `a` with an `i8` lane from `b`, /// producing an `i16` intermediate value. /// * These intermediate `i16` values are horizontally added with saturation. /// /// * **Intrinsic:** [`_mm_maddubs_epi16`] /// * **Assembly:** `pmaddubsw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn mul_u8i8_add_horizontal_saturating_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_maddubs_epi16(a.0, b.0) }) } /// Multiply `i16` lanes into `i32` intermediates, keep the high 18 bits, round /// by adding 1, right shift by 1. /// /// This is `_mm_mulhrs_epi16`, which I can only assume is named for something /// like "high bits rounded and scaled". /// /// * **Intrinsic:** [`_mm_mulhrs_epi16`] /// * **Assembly:** `pmulhrsw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn mul_i16_scale_round_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_mulhrs_epi16(a.0, b.0) }) } /// Shuffle `i8` lanes in `a` using `i8` values in `v`. /// /// If a lane in `v` is negative, that output is zeroed. /// /// * **Intrinsic:** [`_mm_shuffle_epi8`] /// * **Assembly:** `pshufb xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn shuffle_av_i8z_all_m128i(a: m128i, v: m128i) -> m128i { m128i(unsafe { _mm_shuffle_epi8(a.0, v.0) }) } /// Applies the sign of `i8` values in `b` to the values in `a`. /// /// * If `b` is negative: the `a` value is negated. /// * Else If `b` is 0: the `a` value becomes 0. /// * Else the `a` value is unchanged. /// /// * **Intrinsic:** [`_mm_sign_epi8`] /// * **Assembly:** `psignb xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn sign_apply_i8_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_sign_epi8(a.0, b.0) }) } /// Applies the sign of `i16` values in `b` to the values in `a`. /// /// * If `b` is negative: the `a` value is negated. /// * Else If `b` is 0: the `a` value becomes 0. /// * Else the `a` value is unchanged. /// /// * **Intrinsic:** [`_mm_sign_epi16`] /// * **Assembly:** `psignw xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn sign_apply_i16_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_sign_epi16(a.0, b.0) }) } /// Applies the sign of `i32` values in `b` to the values in `a`. /// /// * If `b` is negative: the `a` value is negated. /// * Else If `b` is 0: the `a` value becomes 0. /// * Else the `a` value is unchanged. 
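/// /// A usage sketch covering those three cases: /// /// ``` /// # use safe_arch::*; /// let a = m128i::from([1, -2, 3, -4]); /// let b = m128i::from([-1, 1, 0, -4]); /// let c: [i32; 4] = sign_apply_i32_m128i(a, b).into(); /// assert_eq!(c, [-1, -2, 0, 4]); /// ```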
/// /// * **Intrinsic:** [`_mm_sign_epi32`] /// * **Assembly:** `psignd xmm, xmm` #[must_use] #[inline(always)] #[cfg_attr(docs_rs, doc(cfg(target_feature = "ssse3")))] pub fn sign_apply_i32_m128i(a: m128i, b: m128i) -> m128i { m128i(unsafe { _mm_sign_epi32(a.0, b.0) }) } safe_arch-0.7.1/tests/000077500000000000000000000000001445526200400145765ustar00rootroot00000000000000safe_arch-0.7.1/tests/integration/000077500000000000000000000000001445526200400171215ustar00rootroot00000000000000safe_arch-0.7.1/tests/integration/adx_tests.rs000066400000000000000000000005051445526200400214650ustar00rootroot00000000000000use super::*; #[test] fn test_add_carry_u32() { let mut out = 0_u32; assert_eq!(add_carry_u32(1, u32::MAX, 5, &mut out), 1); assert_eq!(out, 5); } #[test] #[cfg(target_arch = "x86_64")] fn test_add_carry_u64() { let mut out = 0_u64; assert_eq!(add_carry_u64(1, u64::MAX, 5, &mut out), 1); assert_eq!(out, 5); } safe_arch-0.7.1/tests/integration/avx_tests.rs000066400000000000000000001050241445526200400215110ustar00rootroot00000000000000use super::*; #[test] fn test_add_m256d() { let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); let b = m256d::from_array([5.0, 6.0, 7.0, 8.5]); let c = add_m256d(a, b).to_array(); assert_eq!(c, [6.0, 8.0, 10.0, 12.5]); } #[test] fn test_add_m256() { let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 20.0, 30.0, 40.0, 50.0]); let b = m256::from_array([5.0, 6.0, 7.0, 8.5, 90.0, 100.0, 110.0, 51.0]); let c = add_m256(a, b).to_array(); assert_eq!(c, [6.0, 8.0, 10.0, 12.5, 110.0, 130.0, 150.0, 101.0]); } #[test] fn test_addsub_m256d() { let a = m256d::from_array([10.0, 20.0, 30.0, 40.0]); let b = m256d::from_array([100.0, 200.0, 300.0, 400.0]); let c = addsub_m256d(a, b).to_array(); assert_eq!(c, [-90.0, 220.0, -270.0, 440.0]); } #[test] fn test_addsub_m256() { let a = m256::from_array([10.0, 20.0, 30.0, 40.0, 1.0, 2.0, 3.0, 4.0]); let b = m256::from_array([1.0, 20.0, 3.0, 40.0, 11.0, 12.0, 13.0, 14.0]); let c = addsub_m256(a, b).to_array(); assert_eq!(c, [9.0, 40.0, 27.0, 80.0, -10.0, 14.0, -10.0, 18.0]); } #[test] fn test_bitand_m256d() { let a = m256d::from_array([1.0, 0.0, 1.0, 0.0]); let b = m256d::from_array([1.0, 1.0, 0.0, 0.0]); let c = bitand_m256d(a, b).to_array(); assert_eq!(c, [1.0, 0.0, 0.0, 0.0]); } #[test] fn test_bitand_m256() { let a = m256::from_array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]); let b = m256::from_array([1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]); let c = bitand_m256(a, b).to_array(); assert_eq!(c, [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]); } #[test] fn test_bitandnot_m256d() { let a = m256d::from_array([1.0, 0.0, 1.0, 0.0]); let b = m256d::from_array([1.0, 1.0, 0.0, 0.0]); let c = bitandnot_m256d(a, b).to_array(); assert_eq!(c, [0.0, 1.0, 0.0, 0.0]); } #[test] fn test_bitandnot_m256() { let a = m256::from_array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]); let b = m256::from_array([1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]); let c = bitandnot_m256(a, b).to_array(); assert_eq!(c, [0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]); } #[test] fn test_blend_m256d() { let a = m256d::from_array([10.0, 20.0, 30.0, 40.0]); let b = m256d::from_array([100.0, 200.0, 300.0, 400.0]); // let c = blend_m256d::<0b0110>(a, b).to_array(); assert_eq!(c, [10.0, 200.0, 300.0, 40.0]); } #[test] fn test_blend_m256() { let a = m256::from_array([10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0]); let b = m256::from_array([100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0]); // let c = blend_m256::<0b0011_0110>(a, b).to_array(); assert_eq!(c, [10.0, 200.0, 300.0, 40.0, 
500.0, 600.0, 70.0, 80.0]); } #[test] fn test_blend_varying_m256d() { let a = m256d::from_array([0.0, 1.0, 20.0, 30.0]); let b = m256d::from_array([2.0, 3.0, 70.0, 80.0]); let mask = m256d::from_array([-1.0, 0.0, 0.0, -1.0]); let c = blend_varying_m256d(a, b, mask).to_array(); assert_eq!(c, [2.0, 1.0, 20.0, 80.0]); } #[test] fn test_blend_varying_m256() { let a = m256::from_array([0.0, 1.0, 2.0, 3.0, 8.0, 9.0, 10.0, 11.0]); let b = m256::from_array([4.0, 5.0, 6.0, 7.0, -4.0, -5.0, -6.0, -7.0]); let mask = m256::from_array([-1.0, 0.0, -1.0, 0.0, -1.0, -1.0, 0.0, 0.0]); let c = blend_varying_m256(a, b, mask).to_array(); assert_eq!(c, [4.0, 1.0, 6.0, 3.0, -4.0, -5.0, 10.0, 11.0]); } #[test] fn test_load_m128d_splat_m256d() { let a = m128d::from_array([0.0, 1.0]); let b = load_m128d_splat_m256d(&a).to_array(); assert_eq!(b, [0.0, 1.0, 0.0, 1.0]); } #[test] fn test_load_m128_splat_m256() { let a = m128::from_array([0.0, 1.0, 2.0, 3.0]); let b = load_m128_splat_m256(&a).to_array(); assert_eq!(b, [0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0]); } #[test] fn test_load_f64_splat_m256d() { let a = 1.0; let b = load_f64_splat_m256d(&a).to_array(); assert_eq!(b, [1.0, 1.0, 1.0, 1.0]); } #[test] fn test_load_f32_splat_m256() { let a = 1.0; let b = load_f32_splat_m256(&a).to_array(); assert_eq!(b, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]); } #[test] fn test_cast_to_m256_from_m256d() { let a = load_f64_splat_m256d(&1.0); assert_eq!(cast_to_m256_from_m256d(a).to_bits(), [0, 0x3FF0_0000, 0, 0x3FF0_0000, 0, 0x3FF0_0000, 0, 0x3FF0_0000]); } #[test] fn test_cast_to_m256i_from_m256d() { let a = load_f64_splat_m256d(&1.0); let b: [u64; 4] = cast_to_m256i_from_m256d(a).into(); assert_eq!(b, [0x3FF00000_00000000_u64; 4]); } #[test] fn test_cast_to_m256d_from_m256i() { let a = m256i::from([1.0_f64.to_bits(); 4]); let b = cast_to_m256d_from_m256i(a).to_array(); assert_eq!(b, [1.0; 4]); } #[test] fn test_cast_to_m256_from_m256i() { let a = m256i::from([1.0_f32.to_bits(); 8]); let b = cast_to_m256_from_m256i(a).to_array(); assert_eq!(b, [1.0; 8]); } #[test] fn test_cast_to_m128_from_m256() { let a = m256::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let b = cast_to_m128_from_m256(a).to_array(); assert_eq!(b, [1.0, 2.0, 3.0, 4.0]); } #[test] fn test_cast_to_m128d_from_m256d() { let a = m256d::from([1.0, 2.0, 3.0, 4.0]); let b = cast_to_m128d_from_m256d(a).to_array(); assert_eq!(b, [1.0, 2.0]); } #[test] fn test_cast_to_m128i_from_m256i() { let a = m256i::from([1, 2, 3, 4, 5, 6, 7, 8]); let b: [i32; 4] = cast_to_m128i_from_m256i(a).into(); assert_eq!(b, [1, 2, 3, 4]); } #[test] fn test_ceil_m256d() { let a = m256d::from([1.1, 2.5, 3.8, 5.0]); let b = ceil_m256d(a).to_array(); assert_eq!(b, [2.0, 3.0, 4.0, 5.0]); } #[test] fn test_ceil_m256() { let a = m256::from([1.1, 2.5, 3.8, 5.0, -0.5, -1.1, -2.7, -3.0]); let b = ceil_m256(a).to_array(); assert_eq!(b, [2.0, 3.0, 4.0, 5.0, 0.0, -1.0, -2.0, -3.0]); } #[test] fn test_cmp_op_mask_m128() { let a = m128::from_array([2.0, 0.0, -2.0, 0.0]); let b = m128::from_array([1.0, 1.0, -1.0, -1.0]); let c = cmp_op_mask_m128::<{ cmp_op!(GreaterThanOrdered) }>(a, b).to_bits(); assert_eq!(c, [u32::MAX, 0, 0, u32::MAX]); } #[test] fn test_cmp_op_mask_m128_s() { let a = m128::from_array([2.0, 0.0, -2.0, 0.0]); let b = m128::from_array([1.0, 1.0, -1.0, -1.0]); let c = cmp_op_mask_m128_s::<{ cmp_op!(GreaterThanOrdered) }>(a, b).to_bits(); assert_eq!(c, [u32::MAX, 0, (-2_f32).to_bits(), 0]); } #[test] fn test_cmp_op_mask_m256() { let a = m256::from_array([1.0, 5.0, 0.0, 7.0, 5.0, 6.0, 7.0, 
-20.0]); let b = m256::from_array([2.0, 1.0, 3.0, 4.0, 1.0, -2.0, -3.0, -4.0]); let c = cmp_op_mask_m256::<{ cmp_op!(LessThanOrdered) }>(a, b).to_bits(); assert_eq!(c, [u32::MAX, 0, u32::MAX, 0, 0, 0, 0, u32::MAX]); } #[test] fn test_cmp_op_mask_m128d() { let a = m128d::from_array([1.0, 0.0]); let b = m128d::from_array([1.0, 1.0]); let c = cmp_op_mask_m128d::<{ cmp_op!(EqualOrdered) }>(a, b).to_bits(); assert_eq!(c, [u64::MAX, 0]); } #[test] fn test_cmp_op_mask_m128d_s() { let a = m128d::from_array([1.0, 7.0]); let b = m128d::from_array([1.0, 1.0]); let c = cmp_op_mask_m128d_s::<{ cmp_op!(EqualOrdered) }>(a, b).to_bits(); assert_eq!(c, [u64::MAX, 7_f64.to_bits()]); } #[test] fn test_cmp_op_mask_m256d() { let a = m256d::from_array([1.0, 5.0, 0.0, 7.0]); let b = m256d::from_array([2.0, 1.0, 3.0, 4.0]); let c = cmp_op_mask_m256d::<{ cmp_op!(LessThanOrdered) }>(a, b).to_bits(); assert_eq!(c, [u64::MAX, 0, u64::MAX, 0]); } #[test] fn test_convert_to_m256d_from_i32_m128i() { let a = m128i::from([4, 5, 6, 7]); let b = convert_to_m256d_from_i32_m128i(a).to_array(); assert_eq!(b, [4.0, 5.0, 6.0, 7.0]); } #[test] fn test_convert_to_m256_from_i32_m256i() { let a = m256i::from([4, 5, 6, 7, 8, -9, 1, 0]); let b = convert_to_m256_from_i32_m256i(a).to_array(); assert_eq!(b, [4.0, 5.0, 6.0, 7.0, 8.0, -9.0, 1.0, 0.0]); } #[test] fn test_convert_to_i32_m128i_from_m256d() { let a = m256d::from([4.0, 5.0, 6.0, 7.0]); let b: [i32; 4] = convert_to_i32_m128i_from_m256d(a).into(); assert_eq!(b, [4, 5, 6, 7]); } #[test] fn test_convert_to_m128_from_m256d() { let a = m256d::from([4.0, 5.0, 6.0, 7.0]); let b = convert_to_m128_from_m256d(a).to_array(); assert_eq!(b, [4.0, 5.0, 6.0, 7.0]); } #[test] fn test_convert_to_i32_m256i_from_m256() { let a = m256::from([4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]); let b: [i32; 8] = convert_to_i32_m256i_from_m256(a).into(); assert_eq!(b, [4, 5, 6, 7, 8, 9, 10, 11]); } #[test] fn test_convert_to_m256d_from_m128() { let a = m128::from([4.0, 5.0, 6.0, 7.0]); let b = convert_to_m256d_from_m128(a).to_array(); assert_eq!(b, [4.0, 5.0, 6.0, 7.0]); } #[test] fn test_convert_to_f64_from_m256d_s() { let a = m256d::from([4.0, 5.0, 6.0, 7.0]); let b = convert_to_f64_from_m256d_s(a); assert_eq!(b, 4.0); } #[test] fn test_convert_to_i32_from_m256i_s() { let a = m256i::from([4, 5, 6, 7, 8, 9, 10, 11]); let b = convert_to_i32_from_m256i_s(a); assert_eq!(b, 4); } #[test] fn test_convert_to_f32_from_m256_s() { let a = m256::from([4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]); let b = convert_to_f32_from_m256_s(a); assert_eq!(b, 4.0); } #[test] fn test_div_m256d() { let a = m256d::from([4.0, 5.0, 6.0, 7.0]); let b = m256d::from([2.0, 2.0, 3.0, 7.0]); let c = div_m256d(a, b).to_array(); assert_eq!(c, [2.0, 2.5, 2.0, 1.0]); } #[test] fn test_div_m256() { let a = m256::from_array([4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0]); let b = m256::from_array([2.0, 2.0, 3.0, 7.0, 2.0, 3.0, 4.0, 11.0]); let c = div_m256(a, b).to_array(); assert_eq!(c, [2.0, 2.5, 2.0, 1.0, 4.0, 3.0, 2.5, 1.0]); } #[test] fn test_dot_product_m256() { let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let b = m256::from_array([9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]); let c = dot_product_m256::<0b1111_1111>(a, b).to_array(); assert_eq!(c, [110.0, 110.0, 110.0, 110.0, 382.0, 382.0, 382.0, 382.0]); } #[test] fn test_extract_i32_from_m256i() { let a = m256i::from([9, 10, 11, 12, 13, 14, 15, 16]); assert_eq!(extract_i32_from_m256i::<3>(a), 12); } #[test] #[cfg(target_arch = "x86_64")] fn 
test_extract_i64_from_m256i() { let a = m256i::from([9_i64, 10, 11, 12]); assert_eq!(extract_i64_from_m256i::<1>(a), 10_i64); } #[test] fn test_extract_m128d_from_m256d() { let a = m256d::from([13.0, 14.0, 15.0, 16.0]); let b = m128d::from([15.0, 16.0]).to_array(); let c = extract_m128d_from_m256d::<1>(a).to_array(); assert_eq!(b, c); } #[test] fn test_extract_m128_from_m256() { let a = m256::from([9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]); let b = m128::from([13.0, 14.0, 15.0, 16.0]).to_array(); let c = extract_m128_from_m256::<1>(a).to_array(); assert_eq!(b, c); } #[test] fn test_extract_m128i_from_m256i() { let a = m256i::from([9, 10, 11, 12, 13, 14, 15, 16]); let b: [i32; 4] = m128i::from([13, 14, 15, 16]).into(); let c: [i32; 4] = extract_m128i_from_m256i::<1>(a).into(); assert_eq!(b, c); } #[test] fn test_floor_m256d() { let a = m256d::from([1.1, 2.5, 3.8, 5.0]); let b = floor_m256d(a).to_array(); assert_eq!(b, [1.0, 2.0, 3.0, 5.0]); } #[test] fn test_floor_m256() { let a = m256::from([1.1, 2.5, 3.8, 5.0, -0.5, -1.1, -2.7, -3.0]); let b = floor_m256(a).to_array(); assert_eq!(b, [1.0, 2.0, 3.0, 5.0, -1.0, -2.0, -3.0, -3.0]); } #[test] fn test_add_horizontal_m256d() { let a = m256d::from([1.0, 2.0, 3.0, 4.0]); let b = m256d::from([1.0, 3.0, 5.0, 7.0]); let c = add_horizontal_m256d(a, b).to_array(); assert_eq!(c, [3.0, 4.0, 7.0, 12.0]); } #[test] fn test_add_horizontal_m256() { let a = m256::from([8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]); let b = m256::from([0.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]); let c = add_horizontal_m256(a, b).to_array(); assert_eq!(c, [15.0, 11.0, 2.0, 12.0, 7.0, 3.0, 48.0, 192.0]); } #[test] fn test_sub_horizontal_m256d() { let a = m256d::from([1.0, 2.0, 3.0, 4.0]); let b = m256d::from([1.0, 3.0, 5.0, 70.0]); let c = sub_horizontal_m256d(a, b).to_array(); assert_eq!(c, [-1.0, -2.0, -1.0, -65.0]); } #[test] fn test_sub_horizontal_m256() { let a = m256::from([8.0, 17.0, 6.0, 5.0, 4.0, 23.0, 2.0, 1.0]); let b = m256::from([0.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0]); let c = sub_horizontal_m256(a, b).to_array(); assert_eq!(c, [-9.0, 1.0, -2.0, -4.0, -19.0, 1.0, -16.0, -64.0]); } #[test] fn test_insert_i8_to_m256i() { let a = m256i::from([0_i8; 32]); let b: [i8; 32] = insert_i8_to_m256i::<3>(a, 5).into(); let c: [i8; 32] = m256i::from([0_i8, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).into(); assert_eq!(b, c); } #[test] fn test_insert_i16_to_m256i() { let a = m256i::from([0_i16; 16]); let b: [i16; 16] = insert_i16_to_m256i::<3>(a, 5).into(); let c: [i16; 16] = m256i::from([0_i16, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]).into(); assert_eq!(b, c); } #[test] fn test_insert_i32_to_m256i() { let a = m256i::from([0_i32; 8]); let b: [i32; 8] = insert_i32_to_m256i::<3>(a, 5).into(); let c: [i32; 8] = m256i::from([0, 0, 0, 5, 0, 0, 0, 0]).into(); assert_eq!(b, c); } #[test] #[cfg(target_arch = "x86_64")] fn test_insert_i64_to_m256i() { let a = m256i::from([0_i64; 4]); let b: [i64; 4] = insert_i64_to_m256i::<3>(a, 5).into(); let c: [i64; 4] = m256i::from([0, 0, 0, 5_i64]).into(); assert_eq!(b, c); } #[test] fn test_insert_m128d_to_m256d() { let a = m256d::from([0.0; 4]); let b: [f64; 4] = insert_m128d_to_m256d::<1>(a, m128d::from([3.0, 4.0])).to_array(); assert_eq!(b, [0.0, 0.0, 3.0, 4.0]); } #[test] fn test_insert_m128_to_m256() { let a = m256::from([0.0; 8]); let b: [f32; 8] = insert_m128_to_m256::<1>(a, m128::from([1.0, 2.0, 3.0, 4.0])).to_array(); assert_eq!(b, [0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 4.0]); } 
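// Illustrative addition, not part of the original suite: the const index on
// `insert_m128_to_m256` picks which 128-bit half is replaced, so `0` fills the
// low four lanes rather than the high four checked above.
#[test]
fn test_insert_m128_to_m256_low_half_demo() {
  let a = m256::from([0.0; 8]);
  let b: [f32; 8] = insert_m128_to_m256::<0>(a, m128::from([1.0, 2.0, 3.0, 4.0])).to_array();
  assert_eq!(b, [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]);
}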
#[test] fn test_insert_m128i_to_m256i_slow_avx() { let a = m256i::from([0_i32; 8]); let b: [i32; 8] = insert_m128i_to_m256i_slow_avx::<1>(a, m128i::from([1, 2, 3, 4])).into(); assert_eq!(b, [0, 0, 0, 0, 1, 2, 3, 4]); } #[test] fn test_load_m256d() { let a = m256d::from([8.0, 17.0, 6.0, 5.0]); let b = load_m256d(&a); assert_eq!(a.to_array(), b.to_array()); } #[test] fn test_load_m256() { let a = m256::from([8.0, 17.0, 6.0, 5.0, 4.0, 23.0, 2.0, 1.0]); let b = load_m256(&a); assert_eq!(a.to_array(), b.to_array()); } #[test] fn test_load_m256i() { let a = m256i::from([8, 17, 6, 5, 4, 23, 2, 1]); let b = load_m256i(&a); assert_eq!(<[i32; 8]>::from(a), <[i32; 8]>::from(b)); } #[test] fn test_load_unaligned_m256d() { assert_eq!(load_unaligned_m256d(&[8.0, 17.0, 6.0, 5.0]).to_array(), [8.0, 17.0, 6.0, 5.0]); } #[test] fn test_load_unaligned_m256() { assert_eq!(load_unaligned_m256(&[8.0, 17.0, 6.0, 5.0, 1.0, 2.0, 3.0, 4.0]).to_array(), [8.0, 17.0, 6.0, 5.0, 1.0, 2.0, 3.0, 4.0]); } #[test] fn test_load_unaligned_m256i() { assert_eq!(<[i8; 32]>::from(load_unaligned_m256i(&[7_i8; 32])), [7_i8; 32]); } #[test] fn test_load_unaligned_hi_lo_m256d() { assert_eq!(load_unaligned_hi_lo_m256d(&[3.0, 4.0], &[1.0, 2.0]).to_array(), [1.0, 2.0, 3.0, 4.0]); } #[test] fn test_load_unaligned_hi_lo_m256() { assert_eq!(load_unaligned_hi_lo_m256(&[5.0, 6.0, 7.0, 8.0], &[1.0, 2.0, 3.0, 4.0]).to_array(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); } #[test] fn test_load_unaligned_hi_lo_m256i() { assert_eq!(<[i8; 32]>::from(load_unaligned_hi_lo_m256i(&[7_i8; 16], &[9_i8; 16])), [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,]); } #[test] fn test_load_masked_m128d() { let a = m128d::from([8.0, 17.0]); let b = load_masked_m128d(&a, m128i::from([0_i64, -1])).to_array(); assert_eq!(b, [0.0, 17.0]); } #[test] fn test_load_masked_m256d() { let a = m256d::from([8.0, 17.0, 16.0, 20.0]); let b = load_masked_m256d(&a, m256i::from([0_i64, -1, -1, 0])).to_array(); assert_eq!(b, [0.0, 17.0, 16.0, 0.0]); } #[test] fn test_load_masked_m128() { let a = m128::from([8.0, 17.0, 16.0, 12.0]); let b = load_masked_m128(&a, m128i::from([0, -1, -1, 0])).to_array(); assert_eq!(b, [0.0, 17.0, 16.0, 0.0]); } #[test] fn test_load_masked_m256() { let a = m256::from([8.0, 17.0, 16.0, 20.0, 80.0, 1.0, 2.0, 3.0]); let b = load_masked_m256(&a, m256i::from([0, -1, -1, 0, -1, -1, 0, 0])).to_array(); assert_eq!(b, [0.0, 17.0, 16.0, 0.0, 80.0, 1.0, 0.0, 0.0]); } #[test] fn test_store_masked_m128d() { let mut a = m128d::default(); store_masked_m128d(&mut a, m128i::from([0_i64, -1]), m128d::from([8.0, 17.0])); assert_eq!(a.to_array(), [0.0, 17.0]); } #[test] fn test_store_masked_m256d() { let mut a = m256d::default(); store_masked_m256d(&mut a, m256i::from([0_i64, -1, -1, 0]), m256d::from([8.0, 17.0, 16.0, 20.0])); assert_eq!(a.to_array(), [0.0, 17.0, 16.0, 0.0]); } #[test] fn test_store_masked_m128() { let mut a = m128::default(); store_masked_m128(&mut a, m128i::from([0, -1, -1, 0]), m128::from([8.0, 17.0, 16.0, 20.0])); assert_eq!(a.to_array(), [0.0, 17.0, 16.0, 0.0]); } #[test] fn test_store_masked_m256() { let mut a = m256::default(); store_masked_m256(&mut a, m256i::from([0, -1, -1, 0, -1, -1, 0, 0]), m256::from([8.0, 17.0, 16.0, 20.0, 80.0, 1.0, 2.0, 3.0])); assert_eq!(a.to_array(), [0.0, 17.0, 16.0, 0.0, 80.0, 1.0, 0.0, 0.0]); } #[test] fn test_max_m256d() { let a = m256d::from_array([1.0, 12.0, -1.0, 3.0]); let b = m256d::from_array([5.0, 6.0, -0.5, 2.2]); let c = max_m256d(a, b).to_array(); 
assert_eq!(c, [5.0, 12.0, -0.5, 3.0]); } #[test] fn test_max_m256() { let a = m256::from_array([1.0, 12.0, -1.0, 3.0, 10.0, 0.0, 1.0, 2.0]); let b = m256::from_array([5.0, 6.0, -0.5, 2.2, 5.0, 6.0, 7.0, 8.0]); let c = max_m256(a, b).to_array(); assert_eq!(c, [5.0, 12.0, -0.5, 3.0, 10.0, 6.0, 7.0, 8.0]); } #[test] fn test_min_m256d() { let a = m256d::from_array([1.0, 12.0, -1.0, 3.0]); let b = m256d::from_array([5.0, 6.0, -0.5, 2.2]); let c = min_m256d(a, b).to_array(); assert_eq!(c, [1.0, 6.0, -1.0, 2.2]); } #[test] fn test_min_m256() { let a = m256::from_array([1.0, 12.0, -1.0, 3.0, 10.0, 0.0, 1.0, 2.0]); let b = m256::from_array([5.0, 6.0, -0.5, 2.2, 5.0, 6.0, 7.0, 8.0]); let c = min_m256(a, b).to_array(); assert_eq!(c, [1.0, 6.0, -1.0, 2.2, 5.0, 0.0, 1.0, 2.0]); } #[test] fn test_duplicate_odd_lanes_m256d() { let a = m256d::from_array([1.0, 12.0, -1.0, 3.0]); let c = duplicate_odd_lanes_m256d(a).to_array(); assert_eq!(c, [1.0, 1.0, -1.0, -1.0]); } #[test] fn test_duplicate_even_lanes_m256() { let a = m256::from_array([1.0, 12.0, -1.0, 3.0, 0.0, 7.0, 2.0, 50.0]); let c = duplicate_even_lanes_m256(a).to_array(); assert_eq!(c, [12.0, 12.0, 3.0, 3.0, 7.0, 7.0, 50.0, 50.0]); } #[test] fn test_duplicate_odd_lanes_m256() { let a = m256::from_array([1.0, 12.0, -1.0, 3.0, 0.0, 7.0, 2.0, 50.0]); let c = duplicate_odd_lanes_m256(a).to_array(); assert_eq!(c, [1.0, 1.0, -1.0, -1.0, 0.0, 0.0, 2.0, 2.0]); } #[test] fn test_move_mask_m256d() { assert_eq!(0b0100, move_mask_m256d(m256d::from([1.0, 12.0, -1.0, 3.0]))); } #[test] fn test_move_mask_m256() { assert_eq!(0b00110100, move_mask_m256(m256::from([1.0, 12.0, -1.0, 3.0, -1.0, -2.0, 3.0, 4.0]))); } #[test] fn test_mul_m256d() { let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); let b = m256d::from_array([5.0, 6.0, 7.0, 8.5]); let c = mul_m256d(a, b).to_array(); assert_eq!(c, [5.0, 12.0, 21.0, 34.0]); } #[test] fn test_mul_m256() { let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 20.0, 30.0, 40.0, 50.0]); let b = m256::from_array([5.0, 6.0, 7.0, 8.5, 90.0, 100.0, 110.0, 51.0]); let c = mul_m256(a, b).to_array(); assert_eq!(c, [5.0, 12.0, 21.0, 34.0, 1800.0, 3000.0, 4400.0, 2550.0]); } #[test] fn test_bitor_m256d() { let a = m256d::from_array([1.0, 1.0, 0.0, 0.0]); let b = m256d::from_array([1.0, 0.0, 1.0, 0.0]); let c = bitor_m256d(a, b).to_array(); assert_eq!(c, [1.0, 1.0, 1.0, 0.0]); } #[test] fn test_bitor_m256() { let a = m256::from_array([1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]); let b = m256::from_array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]); let c = bitor_m256(a, b).to_array(); assert_eq!(c, [1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0]); } #[test] fn test_permute_m128d() { let a = m128d::from_array([1.0, 2.0]); // let b = permute_m128d::<0b_0_1>(a).to_array(); assert_eq!(b, [2.0, 1.0]); } #[test] fn test_permute_m256d() { let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); // let b = permute_m256d::<0b_0_1_0_1>(a).to_array(); assert_eq!(b, [2.0, 1.0, 4.0, 3.0]); } #[test] fn test_permute_m128() { let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); // let b = permute_m128::<0b_00_00_00_00>(a).to_array(); assert_eq!(b, [1.0, 1.0, 1.0, 1.0]); // let b = permute_m128::<0b_11_00_01_00>(a).to_array(); assert_eq!(b, [1.0, 2.0, 1.0, 4.0]); // let b = permute_m128::<0b_10_10_00_00>(a).to_array(); assert_eq!(b, [1.0, 1.0, 3.0, 3.0]); } #[test] fn test_permute_m256() { let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); // let b = permute_m256::<0b_00_10_01_11>(a).to_array(); assert_eq!(b, [4.0, 2.0, 3.0, 1.0, 8.0, 6.0, 7.0, 5.0]); } #[test] fn 
test_permute2z_m256d() { let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); let b = m256d::from_array([5.0, 6.0, 7.0, 8.0]); // let c = permute2z_m256d::<0b1000_0010>(a, b).to_array(); assert_eq!(c, [5.0, 6.0, 0.0, 0.0]); // let c = permute2z_m256d::<0b0001_1000>(a, b).to_array(); assert_eq!(c, [0.0, 0.0, 3.0, 4.0]); } #[test] fn test_permute2z_m256() { let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let b = m256::from_array([9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]); // let c = permute2z_m256::<0b1000_0010>(a, b).to_array(); assert_eq!(c, [9.0, 10.0, 11.0, 12.0, 0.0, 0.0, 0.0, 0.0]); // let c = permute2z_m256::<0b0001_1000>(a, b).to_array(); assert_eq!(c, [0.0, 0.0, 0.0, 0.0, 5.0, 6.0, 7.0, 8.0]); } #[test] fn test_permute2z_m256i() { let a = m256i::from([1, 2, 3, 4, 5, 6, 7, 8]); let b = m256i::from([9, 10, 11, 12, 13, 14, 15, 16]); // let c: [i32; 8] = permute2z_m256i::<0b1000_0010>(a, b).into(); assert_eq!(c, [9, 10, 11, 12, 0, 0, 0, 0]); // let c: [i32; 8] = permute2z_m256i::<0b0001_1000>(a, b).into(); assert_eq!(c, [0, 0, 0, 0, 5, 6, 7, 8]); } #[test] fn test_shuffle_av_f64_all_m128d() { let a = m128d::from_array([2.0, 3.0]); let v = m128i::from([1_i64 << 1, 0 << 1]); let c = shuffle_av_f64_all_m128d(a, v).to_array(); assert_eq!(c, [3.0, 2.0]); } #[test] fn test_shuffle_av_f64_half_m256d() { let a = m256d::from_array([2.0, 3.0, 7.0, 8.0]); let v = m256i::from([1_i64 << 1, 0 << 1, 1 << 1, 1 << 1]); let c = shuffle_av_f64_half_m256d(a, v).to_array(); assert_eq!(c, [3.0, 2.0, 8.0, 8.0]); } #[test] fn test_shuffle_av_f32_all_m128() { let a = m128::from_array([5.0, 6.0, 7.0, 8.0]); let v = m128i::from([0, 2, 3, 1]); let c = shuffle_av_f32_all_m128(a, v).to_array(); assert_eq!(c, [5.0, 7.0, 8.0, 6.0]); } #[test] fn test_shuffle_av_f32_half_m256() { let a = m256::from_array([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); let v = m256i::from([0, 2, 3, 1, 0, 3, 2, 2]); let c = shuffle_av_f32_half_m256(a, v).to_array(); assert_eq!(c, [0.0, 2.0, 3.0, 1.0, 4.0, 7.0, 6.0, 6.0]); } #[test] fn test_reciprocal_m256() { let a = m256::from_array([1.0, 2.0, 4.0, 8.0, 0.5, 2.0, 8.0, 16.0]); let b = reciprocal_m256(a).to_array(); let expected = [1.0, 0.5, 0.25, 0.125, 2.0, 0.5, 0.125, 0.0625]; for i in 0..4 { assert!((b[i] - expected[i]).abs() < 0.001); } } #[test] fn test_round_m256d() { let a = m256d::from_array([-0.1, 1.6, 2.5, 3.1]); // assert_eq!(round_m256d::<{ round_op!(Nearest) }>(a).to_array(), [0.0, 2.0, 2.0, 3.0]); // assert_eq!(round_m256d::<{ round_op!(NegInf) }>(a).to_array(), [-1.0, 1.0, 2.0, 3.0]); // assert_eq!(round_m256d::<{ round_op!(PosInf) }>(a).to_array(), [0.0, 2.0, 3.0, 4.0]); // assert_eq!(round_m256d::<{ round_op!(Zero) }>(a).to_array(), [0.0, 1.0, 2.0, 3.0]); } #[test] fn test_round_m256() { let a = m256::from_array([-0.1, 1.6, 3.3, 4.5, 5.1, 6.5, 7.2, 8.0]); // assert_eq!(round_m256::<{ round_op!(Nearest) }>(a).to_array(), [0.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); // assert_eq!(round_m256::<{ round_op!(NegInf) }>(a).to_array(), [-1.0, 1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); // assert_eq!(round_m256::<{ round_op!(PosInf) }>(a).to_array(), [0.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 8.0]); // assert_eq!(round_m256::<{ round_op!(Zero) }>(a).to_array(), [0.0, 1.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); } #[test] fn test_reciprocal_sqrt_m256() { let a = m256::from_array([16.0, 9.0, 4.0, 25.0, 16.0, 9.0, 4.0, 25.0]); let b = reciprocal_sqrt_m256(a).to_array(); let expected = [0.25, 0.33333, 0.5, 0.2, 0.25, 0.33333, 0.5, 0.2]; for i in 0..8 { assert!((b[i] - expected[i]).abs() < 
0.001); } } #[test] fn test_set_i8_m256i() { let a: [i8; 32] = set_i8_m256i(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31).into(); assert_eq!(a, [31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]); } #[test] fn test_set_i16_m256i() { let a: [i16; 16] = set_i16_m256i(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15).into(); assert_eq!(a, [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]); } #[test] fn test_set_i32_m256i() { let a: [i32; 8] = set_i32_m256i(0, 1, 2, 3, 4, 5, 6, 7).into(); assert_eq!(a, [7, 6, 5, 4, 3, 2, 1, 0]); } #[test] #[cfg(target_arch = "x86_64")] fn test_set_i64_m256i() { let a: [i64; 4] = set_i64_m256i(0, 1, 2, 3).into(); assert_eq!(a, [3, 2, 1, 0]); } #[test] fn test_set_m128_m256() { let a = set_m128_m256(m128::from([4.0, 5.0, 6.0, 7.0]), m128::from([0.0, 1.0, 2.0, 3.0])).to_array(); assert_eq!(a, [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); } #[test] fn test_set_m128d_m256d() { let a = set_m128d_m256d(m128d::from([2.0, 3.0]), m128d::from([0.0, 1.0])).to_array(); assert_eq!(a, [0.0, 1.0, 2.0, 3.0]); } #[test] fn test_set_m128i_m256i() { let a: [i64; 4] = set_m128i_m256i(set_i64_m128i(3_i64, 2), set_i64_m128i(1_i64, 0)).into(); assert_eq!(a, [0_i64, 1, 2, 3]); } #[test] fn test_set_m256d() { let a = set_m256d(0.0, 1.0, 2.0, 3.0).to_array(); assert_eq!(a, [3.0, 2.0, 1.0, 0.0]); } #[test] fn test_set_m256() { let a = set_m256(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0).to_array(); assert_eq!(a, [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0]); } #[test] fn test_set_splat_i8_m256i() { let a: [i8; 32] = set_splat_i8_m256i(56).into(); assert_eq!(a, [56_i8; 32]); } #[test] fn test_set_splat_i16_m256i() { let a: [i16; 16] = set_splat_i16_m256i(56).into(); assert_eq!(a, [56_i16; 16]); } #[test] fn test_set_splat_i32_m256i() { let a: [i32; 8] = set_splat_i32_m256i(56).into(); assert_eq!(a, [56_i32; 8]); } #[test] #[cfg(target_arch = "x86_64")] fn test_set_splat_i64_m256i() { let a: [i64; 4] = set_splat_i64_m256i(56).into(); assert_eq!(a, [56_i64; 4]); } #[test] fn test_set_splat_m256d() { let a = set_splat_m256d(56.0).to_array(); assert_eq!(a, [56.0; 4]); } #[test] fn test_set_splat_m256() { let a = set_splat_m256(56.0).to_array(); assert_eq!(a, [56.0; 8]); } #[test] fn test_set_reversed_i8_m256i() { let a: [i8; 32] = set_reversed_i8_m256i(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31).into(); assert_eq!(a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]); } #[test] fn test_set_reversed_i16_m256i() { let a: [i16; 16] = set_reversed_i16_m256i(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15).into(); assert_eq!(a, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); } #[test] fn test_set_reversed_i32_m256i() { let a: [i32; 8] = set_reversed_i32_m256i(0, 1, 2, 3, 4, 5, 6, 7).into(); assert_eq!(a, [0, 1, 2, 3, 4, 5, 6, 7]); } #[test] #[cfg(target_arch = "x86_64")] fn test_set_reversed_i64_m256i() { let a: [i64; 4] = set_reversed_i64_m256i(0, 1, 2, 3).into(); assert_eq!(a, [0, 1, 2, 3]); } #[test] fn test_set_reversed_m128_m256() { let a = set_reversed_m128_m256(set_reversed_m128(7.0, 6.0, 5.0, 4.0), set_reversed_m128(3.0, 2.0, 1.0, 0.0)).to_array(); assert_eq!(a, [7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0]); } #[test] fn test_set_reversed_m128d_m256d() { let a = 
set_reversed_m128d_m256d(set_reversed_m128d(3.0, 2.0), set_reversed_m128d(1.0, 0.0)).to_array(); assert_eq!(a, [3.0, 2.0, 1.0, 0.0]); } #[test] fn test_set_reversed_m128i_m256i() { let a: [i64; 4] = set_reversed_m128i_m256i(m128i::from([0_i64, 1]), m128i::from([2_i64, 3])).into(); assert_eq!(a, [0_i64, 1, 2, 3]); } #[test] fn test_set_reversed_m256d() { let a = set_reversed_m256d(0.0, 1.0, 2.0, 3.0).to_array(); assert_eq!(a, [0.0, 1.0, 2.0, 3.0]); } #[test] fn test_set_reversed_m256() { let a = set_reversed_m256(0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0).to_array(); assert_eq!(a, [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]); } #[test] fn test_zeroed_m256d() { let a = zeroed_m256d().to_array(); assert_eq!(a, [0.0; 4]); } #[test] fn test_zeroed_m256() { let a = zeroed_m256().to_array(); assert_eq!(a, [0.0; 8]); } #[test] fn test_zeroed_m256i() { let a: [i32; 8] = zeroed_m256i().into(); assert_eq!(a, [0; 8]); } #[test] fn test_shuffle_m256d() { let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); let b = m256d::from_array([5.0, 6.0, 7.0, 8.0]); // let c = shuffle_m256d::<0b_0_0_0_0>(a, b).to_array(); assert_eq!(c, [1.0, 5.0, 3.0, 7.0]); // let c = shuffle_m256d::<0b_0_0_0_1>(a, b).to_array(); assert_eq!(c, [2.0, 5.0, 3.0, 7.0]); // let c = shuffle_m256d::<0b_0_0_1_0>(a, b).to_array(); assert_eq!(c, [1.0, 6.0, 3.0, 7.0]); // let c = shuffle_m256d::<0b_0_0_1_1>(a, b).to_array(); assert_eq!(c, [2.0, 6.0, 3.0, 7.0]); // let c = shuffle_m256d::<0b_1_0_0_1>(a, b).to_array(); assert_eq!(c, [2.0, 5.0, 3.0, 8.0]); // let c = shuffle_m256d::<0b_0_1_0_1>(a, b).to_array(); assert_eq!(c, [2.0, 5.0, 4.0, 7.0]); // let c = shuffle_m256d::<0b_1_1_1_1>(a, b).to_array(); assert_eq!(c, [2.0, 6.0, 4.0, 8.0]); } #[test] fn test_shuffle_m256() { let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); let b = m256::from_array([9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]); // let c = shuffle_m256::<0b_00_10_11_01>(a, b).to_array(); assert_eq!(c, [2.0, 4.0, 11.0, 9.0, 6.0, 8.0, 15.0, 13.0]); } #[test] fn test_sqrt_m256d() { let a = m256d::from_array([1.0, 4.0, 9.0, 16.0]); let b = sqrt_m256d(a).to_array(); assert_eq!(b, [1.0, 2.0, 3.0, 4.0]); } #[test] fn test_sqrt_m256() { let a = m256::from_array([1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 0.0, 49.0]); let b = sqrt_m256(a).to_array(); assert_eq!(b, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 0.0, 7.0]); } #[test] fn test_store_m256d() { let mut addr = m256d::from([0.0; 4]); store_m256d(&mut addr, m256d::from([1.0, 2.0, 3.0, 4.0])); assert_eq!(addr.to_array(), [1.0, 2.0, 3.0, 4.0]); } #[test] fn test_store_m256() { let mut addr = m256::from([0.0; 8]); store_m256(&mut addr, m256::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])); assert_eq!(addr.to_array(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); } #[test] fn test_store_m256i() { let mut addr = m256i::from([0_i32; 8]); store_m256i(&mut addr, m256i::from([1, 2, 3, 4, 5, 6, 7, 8])); assert_eq!(<[i32; 8]>::from(addr), [1, 2, 3, 4, 5, 6, 7, 8]); } #[test] fn test_store_unaligned_m256d() { let mut addr = [0.0; 4]; store_unaligned_m256d(&mut addr, m256d::from([1.0, 2.0, 3.0, 4.0])); assert_eq!(addr, [1.0, 2.0, 3.0, 4.0]); } #[test] fn test_store_unaligned_m256() { let mut addr = [0.0; 8]; store_unaligned_m256(&mut addr, m256::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])); assert_eq!(addr, [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); } #[test] fn test_store_unaligned_m256i() { let mut addr = [0_i8; 32]; store_unaligned_m256i(&mut addr, m256i::from([12_i8; 32])); assert_eq!(addr, [12_i8; 32]); } #[test] fn test_store_unaligned_hi_lo_m256d() 
{ let mut hi_addr = [0.0; 2]; let mut lo_addr = [0.0; 2]; store_unaligned_hi_lo_m256d(&mut hi_addr, &mut lo_addr, m256d::from([1.0, 2.0, 3.0, 4.0])); assert_eq!(hi_addr, [3.0, 4.0]); assert_eq!(lo_addr, [1.0, 2.0]); } #[test] fn test_store_unaligned_hi_lo_m256() { let mut hi_addr = [0.0; 4]; let mut lo_addr = [0.0; 4]; store_unaligned_hi_lo_m256(&mut hi_addr, &mut lo_addr, m256::from([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0])); assert_eq!(hi_addr, [5.0, 6.0, 7.0, 8.0]); assert_eq!(lo_addr, [1.0, 2.0, 3.0, 4.0]); } #[test] fn test_store_unaligned_hi_lo_m256i() { let mut hi_addr = [0_i8; 16]; let mut lo_addr = [0_i8; 16]; store_unaligned_hi_lo_m256i(&mut hi_addr, &mut lo_addr, m256i::from([56_i8; 32])); assert_eq!(hi_addr, [56_i8; 16]); assert_eq!(lo_addr, [56_i8; 16]); } #[test] fn test_sub_m256d() { let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); let b = m256d::from_array([5.0, 60.0, 712.0, 8.5]); let c = sub_m256d(a, b).to_array(); assert_eq!(c, [-4.0, -58.0, -709.0, -4.5]); } #[test] fn test_sub_m256() { let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 20.0, 30.0, 40.0, 50.0]); let b = m256::from_array([59.0, 61.0, 79.0, 81.5, 90.0, 100.0, 110.0, 51.0]); let c = sub_m256(a, b).to_array(); assert_eq!(c, [-58.0, -59.0, -76.0, -77.5, -70.0, -70.0, -70.0, -1.0]); } #[test] fn test_unpack_hi_m256d() { let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); let b = m256d::from_array([59.0, 61.0, 79.0, 81.5]); let c = unpack_hi_m256d(a, b).to_array(); assert_eq!(c, [2.0, 61.0, 4.0, 81.5]); } #[test] fn test_unpack_hi_m256() { let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 20.0, 30.0, 40.0, 50.0]); let b = m256::from_array([59.0, 61.0, 79.0, 81.5, 90.0, 100.0, 110.0, 51.0]); let c = unpack_hi_m256(a, b).to_array(); assert_eq!(c, [3.0, 79.0, 4.0, 81.5, 40.0, 110.0, 50.0, 51.0]); } #[test] fn test_unpack_lo_m256d() { let a = m256d::from_array([1.0, 2.0, 3.0, 4.0]); let b = m256d::from_array([59.0, 61.0, 79.0, 81.5]); let c = unpack_lo_m256d(a, b).to_array(); assert_eq!(c, [1.0, 59.0, 3.0, 79.0]); } #[test] fn test_unpack_lo_m256() { let a = m256::from_array([1.0, 2.0, 3.0, 4.0, 20.0, 30.0, 40.0, 50.0]); let b = m256::from_array([59.0, 61.0, 79.0, 81.5, 90.0, 100.0, 110.0, 51.0]); let c = unpack_lo_m256(a, b).to_array(); assert_eq!(c, [1.0, 59.0, 2.0, 61.0, 20.0, 90.0, 30.0, 100.0]); } #[test] fn test_bitxor_m256d() { let a = m256d::from_array([1.0, 0.0, 1.0, 0.0]); let b = m256d::from_array([1.0, 1.0, 0.0, 0.0]); let c = bitxor_m256d(a, b).to_array(); assert_eq!(c, [0.0, 1.0, 1.0, 0.0]); } #[test] fn test_bitxor_m256() { let a = m256::from_array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0]); let b = m256::from_array([1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0]); let c = bitxor_m256(a, b).to_array(); assert_eq!(c, [0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0]); } #[test] fn test_zero_extend_m128d() { let a = zero_extend_m128d(m128d::from_array([1.0, 2.0])).to_array(); assert_eq!(a, [1.0, 2.0, 0.0, 0.0]); } #[test] fn test_zero_extend_m128() { let a = zero_extend_m128(m128::from_array([1.0, 2.0, 3.0, 4.0])).to_array(); assert_eq!(a, [1.0, 2.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]); } #[test] fn test_zero_extend_m128i() { let a: [i32; 8] = zero_extend_m128i(m128i::from([1, 2, 3, 4])).into(); assert_eq!(a, [1, 2, 3, 4, 0, 0, 0, 0]); } safe_arch-0.7.1/tests/integration/bmi1_tests.rs000066400000000000000000000074651445526200400215550ustar00rootroot00000000000000use super::*; #[test] fn test_bitandnot_u32() { let a = [1, 0, 1, 0]; let b = [1, 1, 0, 0]; let mut c = [0_u32; 4]; for i in 0..4 { c[i] = bitandnot_u32(a[i], b[i]); } 
assert_eq!(c, [0, 1, 0, 0]); } #[test] #[cfg(target_arch = "x86_64")] fn test_bitandnot_u64() { let a = [1_u64, 0, 1, 0]; let b = [1_u64, 1, 0, 0]; let mut c = [0_u64; 4]; for i in 0..4 { c[i] = bitandnot_u64(a[i], b[i]); } assert_eq!(c, [0_u64, 1, 0, 0]); } #[test] fn test_bit_extract_u32() { assert_eq!(bit_extract_u32(0b0110, 0, 3), 0b110); assert_eq!(bit_extract_u32(0b0110, 0, 2), 0b10); assert_eq!(bit_extract_u32(0b0110, 1, 2), 0b11); } #[test] #[cfg(target_arch = "x86_64")] fn test_bit_extract_u64() { assert_eq!(bit_extract_u64(0b0110, 0, 3), 0b110); assert_eq!(bit_extract_u64(0b0110, 0, 2), 0b10); assert_eq!(bit_extract_u64(0b0110, 1, 2), 0b11); } #[test] fn test_bit_extract2_u32() { assert_eq!(bit_extract2_u32(0b0110, (3 << 8) | 0), 0b110); assert_eq!(bit_extract2_u32(0b0110, (2 << 8) | 0), 0b10); assert_eq!(bit_extract2_u32(0b0110, (2 << 8) | 1), 0b11); } #[test] #[cfg(target_arch = "x86_64")] fn test_bit_extract2_u64() { assert_eq!(bit_extract2_u64(0b0110, (3 << 8) | 0), 0b110); assert_eq!(bit_extract2_u64(0b0110, (2 << 8) | 0), 0b10); assert_eq!(bit_extract2_u64(0b0110, (2 << 8) | 1), 0b11); } #[test] fn test_bit_lowest_set_value_u32() { assert_eq!(bit_lowest_set_value_u32(0b0), 0); assert_eq!(bit_lowest_set_value_u32(0b1), 1); assert_eq!(bit_lowest_set_value_u32(0b10), 2); assert_eq!(bit_lowest_set_value_u32(0b100), 4); assert_eq!(bit_lowest_set_value_u32(0b111100), 4); } #[test] #[cfg(target_arch = "x86_64")] fn test_bit_lowest_set_value_u64() { assert_eq!(bit_lowest_set_value_u64(0b0), 0); assert_eq!(bit_lowest_set_value_u64(0b1), 1); assert_eq!(bit_lowest_set_value_u64(0b10), 2); assert_eq!(bit_lowest_set_value_u64(0b100), 4); assert_eq!(bit_lowest_set_value_u64(0b111100), 4); } #[test] fn test_bit_lowest_set_mask_u32() { assert_eq!(bit_lowest_set_mask_u32(0b0), u32::MAX); assert_eq!(bit_lowest_set_mask_u32(0b1), 0b1); assert_eq!(bit_lowest_set_mask_u32(0b10), 0b11); assert_eq!(bit_lowest_set_mask_u32(0b100), 0b111); assert_eq!(bit_lowest_set_mask_u32(0b111100), 0b111); } #[test] #[cfg(target_arch = "x86_64")] fn test_bit_lowest_set_mask_u64() { assert_eq!(bit_lowest_set_mask_u64(0b0), u64::MAX); assert_eq!(bit_lowest_set_mask_u64(0b1), 0b1); assert_eq!(bit_lowest_set_mask_u64(0b10), 0b11); assert_eq!(bit_lowest_set_mask_u64(0b100), 0b111); assert_eq!(bit_lowest_set_mask_u64(0b111100), 0b111); } #[test] fn test_bit_lowest_set_reset_u32() { assert_eq!(bit_lowest_set_reset_u32(0b0), 0); assert_eq!(bit_lowest_set_reset_u32(0b1), 0b0); assert_eq!(bit_lowest_set_reset_u32(0b10), 0b00); assert_eq!(bit_lowest_set_reset_u32(0b100), 0b000); assert_eq!(bit_lowest_set_reset_u32(0b111100), 0b111000); } #[test] #[cfg(target_arch = "x86_64")] fn test_bit_lowest_set_reset_u64() { assert_eq!(bit_lowest_set_reset_u64(0b0), 0); assert_eq!(bit_lowest_set_reset_u64(0b1), 0b0); assert_eq!(bit_lowest_set_reset_u64(0b10), 0b00); assert_eq!(bit_lowest_set_reset_u64(0b100), 0b000); assert_eq!(bit_lowest_set_reset_u64(0b111100), 0b111000); } #[test] fn test_trailing_zero_count_u32() { assert_eq!(trailing_zero_count_u32(0b0), 32); assert_eq!(trailing_zero_count_u32(0b1), 0); assert_eq!(trailing_zero_count_u32(0b10), 1); assert_eq!(trailing_zero_count_u32(0b100), 2); assert_eq!(trailing_zero_count_u32(0b111100), 2); } #[test] #[cfg(target_arch = "x86_64")] fn test_trailing_zero_count_u64() { assert_eq!(trailing_zero_count_u64(0b0), 64); assert_eq!(trailing_zero_count_u64(0b1), 0); assert_eq!(trailing_zero_count_u64(0b10), 1); assert_eq!(trailing_zero_count_u64(0b100), 2); 
assert_eq!(trailing_zero_count_u64(0b111100), 2); } safe_arch-0.7.1/tests/integration/bmi2_tests.rs000066400000000000000000000036121445526200400215440ustar00rootroot00000000000000use super::*; #[test] fn test_bit_zero_high_index_u32() { assert_eq!(bit_zero_high_index_u32(0b1111, 0), 0b0000); assert_eq!(bit_zero_high_index_u32(0b1111, 1), 0b0001); assert_eq!(bit_zero_high_index_u32(0b1111, 2), 0b0011); assert_eq!(bit_zero_high_index_u32(0b1111, 3), 0b0111); } #[test] #[cfg(target_arch = "x86_64")] fn test_bit_zero_high_index_u64() { assert_eq!(bit_zero_high_index_u64(0b1111, 0), 0b0000); assert_eq!(bit_zero_high_index_u64(0b1111, 1), 0b0001); assert_eq!(bit_zero_high_index_u64(0b1111, 2), 0b0011); assert_eq!(bit_zero_high_index_u64(0b1111, 3), 0b0111); } #[test] fn test_mul_extended_u32() { let mut x = 0_u32; assert_eq!(mul_extended_u32(u32::MAX, 17, &mut x), 4294967279); assert_eq!(x, 16); } #[test] #[cfg(target_arch = "x86_64")] fn test_mul_extended_u64() { let mut x = 0_u64; assert_eq!(mul_extended_u64(u64::MAX, 17, &mut x), 18446744073709551599); assert_eq!(x, 16); } #[test] fn test_population_deposit_u32() { assert_eq!(population_deposit_u32(0b1001, 0b1111), 0b1001); assert_eq!(population_deposit_u32(0b1001, 0b1110), 0b0010); assert_eq!(population_deposit_u32(0b1001, 0b1100), 0b0100); } #[test] #[cfg(target_arch = "x86_64")] fn test_population_deposit_u64() { assert_eq!(population_deposit_u64(0b1001, 0b1111), 0b1001); assert_eq!(population_deposit_u64(0b1001, 0b1110), 0b0010); assert_eq!(population_deposit_u64(0b1001, 0b1100), 0b0100); } #[test] fn test_population_extract_u32() { assert_eq!(population_extract_u32(0b1001, 0b1111), 0b1001); assert_eq!(population_extract_u32(0b1001, 0b1110), 0b0100); assert_eq!(population_extract_u32(0b1001, 0b1100), 0b0010); } #[test] #[cfg(target_arch = "x86_64")] fn test_population_extract_u64() { assert_eq!(population_extract_u64(0b1001, 0b1111), 0b1001); assert_eq!(population_extract_u64(0b1001, 0b1110), 0b0100); assert_eq!(population_extract_u64(0b1001, 0b1100), 0b0010); } safe_arch-0.7.1/tests/integration/lzcnt_tests.rs000066400000000000000000000005371445526200400220500ustar00rootroot00000000000000use super::*; #[test] fn test_leading_zero_count_u32() { assert_eq!(leading_zero_count_u32(u32::MAX), 0); assert_eq!(leading_zero_count_u32(u32::MAX >> 3), 3); } #[test] #[cfg(target_arch = "x86_64")] fn test_leading_zero_count_u64() { assert_eq!(leading_zero_count_u64(u64::MAX), 0); assert_eq!(leading_zero_count_u64(u64::MAX >> 3), 3); } safe_arch-0.7.1/tests/integration/main.rs000066400000000000000000000054651445526200400204250ustar00rootroot00000000000000#![allow(bad_style)] #![allow(unused_imports)] #![allow(clippy::identity_op)] use safe_arch::*; #[cfg(target_feature = "adx")] mod adx_tests; #[cfg(target_feature = "avx")] mod avx_tests; #[cfg(target_feature = "bmi1")] mod bmi1_tests; #[cfg(target_feature = "bmi2")] mod bmi2_tests; #[cfg(target_feature = "lzcnt")] mod lzcnt_tests; #[cfg(target_feature = "pclmulqdq")] mod pclmulqdq_tests; #[cfg(target_feature = "popcnt")] mod popcnt_tests; #[cfg(target_feature = "rdrand")] mod rdrand_tests; #[cfg(target_feature = "rdseed")] mod rdseed_tests; #[cfg(target_feature = "sse2")] mod sse2_tests; #[cfg(target_feature = "sse3")] mod sse3_tests; #[cfg(target_feature = "ssse3")] mod ssse3_tests; #[cfg(target_feature = "sse4.2")] mod sse4_2_tests; #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn test_m128_size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 
16); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn test_m128d_size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn test_m128i_size_align() { assert_eq!(core::mem::size_of::(), 16); assert_eq!(core::mem::align_of::(), 16); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn test_m256_size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn test_m256d_size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn test_m256i_size_align() { assert_eq!(core::mem::size_of::(), 32); assert_eq!(core::mem::align_of::(), 32); } #[test] #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] fn test_m128_fmt() { let f = format!("{:?}", m128::default()); assert_eq!(&f, "m128(0.0, 0.0, 0.0, 0.0)"); let f = format!("{}", m128::default()); assert_eq!(&f, "(0, 0, 0, 0)"); let f = format!("{:b}", m128::default()); assert_eq!(&f, "(0, 0, 0, 0)"); let f = format!("{:e}", m128::default()); assert_eq!(&f, "(0e0, 0e0, 0e0, 0e0)"); let f = format!("{:E}", m128::default()); assert_eq!(&f, "(0E0, 0E0, 0E0, 0E0)"); let f = format!("{:x}", m128::default()); assert_eq!(&f, "(0, 0, 0, 0)"); let f = format!("{:X}", m128::default()); assert_eq!(&f, "(0, 0, 0, 0)"); let f = format!("{:o}", m128::default()); assert_eq!(&f, "(0, 0, 0, 0)"); } #[allow(dead_code)] fn approx_eq_f32(a: f32, b: f32) -> bool { (a - b).abs() < 0.00000001 } #[allow(dead_code)] fn approx_eq_f64(a: f64, b: f64) -> bool { (a - b).abs() < 0.00000000001 } safe_arch-0.7.1/tests/integration/memcpy.rs.bak000066400000000000000000000071341445526200400215220ustar00rootroot00000000000000#![cfg(feature = "nightly")] #![cfg(target_feature = "avx")] #![cfg(feature = "bytemuck")] #![feature(test)] #![feature(slice_iter_mut_as_slice)] #![feature(fixed_size_array)] use std::array::FixedSizeArray; /* const generics pls */ #[allow(unused_must_use)] #[allow(unused_variables)] /* for &[u8] -> &[u8; 16/32] conversion */ use std::convert::TryInto; use bytemuck; use safe_arch::*; fn memcpy_bytes(src: &[u8], dst: &mut [u8]) { if src.len() != dst.len() { return; } for (d, s) in dst.iter_mut().zip(src.iter()) { *d = *s; } } fn memcpy_avx(src: &[u8], dst: &mut [u8]) { if src.len() != dst.len() { return; } let (src_begin, src_meat, src_end) = bytemuck::pod_align_to(src); let mut dst_it = dst.iter_mut(); /* Order of this zip is important, as src_begin.len() <= dst.len() * and zip first checks for first iterator for None, then the second. * So, swapping them around would result in dst_it being one byte * further than it should */ for (s, d) in src_begin.iter().zip(dst_it.by_ref()) { *d = *s; } let mut dst_chunks = dst_it.into_slice().chunks_exact_mut(32); for (d, s) in dst_chunks.by_ref().zip(src_meat.iter()) { let d: &mut [i8] = bytemuck::cast_slice_mut(d); let d: &mut [i8; 32] = d.try_into().expect("Impossible!"); store_unaligned_m256i(d, *s); } memcpy_bytes(src_end, dst_chunks.into_remainder()); } fn memcpy_sse(src: &[u8], dst: &mut [u8]) { if src.len() != dst.len() { return; } let (src_begin, src_meat, src_end) = bytemuck::pod_align_to(src); let mut dst_it = dst.iter_mut(); /* Order of this zip is important, as src_begin.len() <= dst.len() * and zip first checks for first iterator for None, then the second. 
* So, swapping them around would result in dst_it being one byte * further than it should */ for (s, d) in src_begin.iter().zip(dst_it.by_ref()) { *d = *s; } let mut dst_chunks = dst_it.into_slice().chunks_exact_mut(16); for (d, s) in dst_chunks.by_ref().zip(src_meat.iter()) { let d: &mut [u8; 16] = d.try_into().expect("Impossible!"); store_unaligned_m128i(d, *s); } memcpy_bytes(src_end, dst_chunks.into_remainder()); } fn poor_rng(x: u64) -> u64 { let x = x ^ 0xDEADBEEFFEEBDAED; (x >> 3) | (x << 1) } fn random_bytes(n: usize) -> Vec { let mut vec = Vec::with_capacity(n); let mut rng = 0; for _ in 0..n { rng = poor_rng(rng); vec.push(rng as u8); } return vec; } #[cfg(test)] mod tests { use super::*; #[test] fn test0() { let s = b"aoisjdiouwgowimecwieohffiowejfiowenofiweiofji".as_slice(); let mut d = Vec::new(); d.resize(s.len(), 0u8); memcpy_avx(s, &mut d); assert_eq!(s, d.as_slice()); } #[test] fn test1() { let s = b"aoisjdiouwgowimecwieohffiowejfiowenofiweiofji".as_slice(); let mut d = Vec::new(); d.resize(s.len(), 0u8); memcpy_sse(s, &mut d); assert_eq!(s, d.as_slice()); } } #[cfg(test)] mod benches { extern crate test; use super::*; use test::{black_box, Bencher}; const N: usize = 1 << 20; #[bench] fn bench_memcpy_avx(b: &mut Bencher) { let from = random_bytes(N); let mut into = Vec::new(); into.resize(N, 0u8); b.iter(|| { let mut a = black_box(into[0]); memcpy_avx(&from, &mut into); a += into[0]; let _b = black_box(a); }); } #[bench] fn bench_memcpy_bytes(b: &mut Bencher) { let from = random_bytes(N); let mut into = Vec::new(); into.resize(N, 0u8); b.iter(|| { let mut a = black_box(into[0]); memcpy_bytes(&from, &mut into); a += into[0]; let _b = black_box(a); }); } } safe_arch-0.7.1/tests/integration/pclmulqdq_tests.rs000066400000000000000000000011441445526200400227130ustar00rootroot00000000000000use super::*; #[test] fn test_mul_i64_carryless_m128i() { let x = m128i::from([2_i64, 3]); let y = m128i::from([4_i64, 500]); // let c: [i64; 2] = mul_i64_carryless_m128i::<{ 0 | (0 << 4) }>(x, y).into(); assert_eq!(c, [8_i64, 0]); let c: [i64; 2] = mul_i64_carryless_m128i::<{ 1 | (0 << 4) }>(x, y).into(); assert_eq!(c, [12_i64, 0]); let c: [i64; 2] = mul_i64_carryless_m128i::<{ 0 | (1 << 4) }>(x, y).into(); assert_eq!(c, [1000_i64, 0]); let c: [i64; 2] = mul_i64_carryless_m128i::<{ 1 | (1 << 4) }>(x, y).into(); assert_eq!(c, [540_i64, 0]); // not 1500 like a normal mul would be! 
} safe_arch-0.7.1/tests/integration/popcnt_tests.rs000066400000000000000000000006171445526200400222200ustar00rootroot00000000000000use super::*; #[test] fn test_population_count_i32() { assert_eq!(population_count_i32(0), 0); assert_eq!(population_count_i32(0b1), 1); assert_eq!(population_count_i32(0b1001), 2); } #[test] #[cfg(target_arch = "x86_64")] fn test_population_count_i64() { assert_eq!(population_count_i64(0), 0); assert_eq!(population_count_i64(0b1), 1); assert_eq!(population_count_i64(0b1001), 2); } safe_arch-0.7.1/tests/integration/pythagoras.rs.bak000066400000000000000000000134471445526200400224150ustar00rootroot00000000000000#![cfg(target_feature = "avx")] #![cfg(feature = "bytemuck")] #[cfg_attr(feature = "nightly", feature(test))] #[allow(unused_must_use)] #[allow(unused_variables)] mod definitions { pub type Point2D = [f32; 2]; pub const PLAYER_POS: Point2D = [128.0, 128.0]; pub const MAX_DISTANCE: f32 = 16.0; } #[cfg(target_feature = "sse3")] mod sse { use super::{definitions::*, scalar}; use bytemuck; use safe_arch::*; fn sub_and_square(xyxy: m128, player_pos: m128) -> m128 { let xyxy = xyxy - player_pos; xyxy * xyxy } fn is_close(xyxy: m128, max_distance: m128) -> i32 { let results = cmp_lt_mask_m128(xyxy, max_distance); return move_mask_m128(results); } pub fn objects_close(x: &[Point2D]) -> usize { let player_pos: m128 = [PLAYER_POS[0], PLAYER_POS[1], PLAYER_POS[0], PLAYER_POS[1]].into(); let max_distances = load_f32_splat_m128(&MAX_DISTANCE); let max_distances_squared = max_distances * max_distances; let (begin, meat, end) = bytemuck::pod_align_to(x); let mut it = meat.chunks_exact(2); let mut result = scalar::objects_close(begin); for chunk in it.by_ref() { let distances_squared = add_horizontal_m128( sub_and_square(chunk[0], player_pos), sub_and_square(chunk[1], player_pos), ); let results = is_close(distances_squared, max_distances_squared); result += results.count_ones() as usize; } if let Some(remainder) = it.remainder().get(0) { let xyxy = sub_and_square(*remainder, player_pos); let distances_squared = add_horizontal_m128(xyxy, xyxy); let results = is_close(distances_squared, max_distances_squared); result += results.count_ones() as usize / 2; } return result + scalar::objects_close(end); } #[test] fn test_points_pythagoras() { use super::testutils::*; let mut rng = 0; for _ in 0..128 { rng = poor_rng(rng); let pos = random_positions(rng as usize); assert_eq!(scalar::objects_close(&pos), objects_close(&pos)); } } } #[cfg(target_feature = "avx")] mod avx { use super::{definitions::*, scalar}; use bytemuck; use safe_arch::*; fn sub_and_square(xyxyxyxy: m256, player_pos: m256) -> m256 { let xyxyxyxy = xyxyxyxy - player_pos; xyxyxyxy * xyxyxyxy } fn is_close(xyxyxyxy: m256, max_distance: m256) -> i32 { let results = cmp_op_mask_m256!(xyxyxyxy, LessThanOrdered, max_distance); return move_mask_m256(results); } pub fn objects_close(x: &[Point2D]) -> usize { let player_pos: m256 = [ PLAYER_POS[0], PLAYER_POS[1], PLAYER_POS[0], PLAYER_POS[1], PLAYER_POS[0], PLAYER_POS[1], PLAYER_POS[0], PLAYER_POS[1], ] .into(); let max_distances = load_f32_splat_m256(&MAX_DISTANCE); let max_distances_squared = max_distances * max_distances; let (begin, meat, end) = bytemuck::pod_align_to(x); let mut it = meat.chunks_exact(2); let mut result = scalar::objects_close(begin); for chunk in it.by_ref() { let distances_squared = add_horizontal_m256( sub_and_square(chunk[0], player_pos), sub_and_square(chunk[1], player_pos), ); let results = is_close(distances_squared, max_distances_squared); 
result += results.count_ones() as usize; } if let Some(remainder) = it.remainder().get(0) { let xyxy = sub_and_square(*remainder, player_pos); let distances_squared = add_horizontal_m256(xyxy, xyxy); let results = is_close(distances_squared, max_distances_squared); result += results.count_ones() as usize / 2; } return result + scalar::objects_close(end); } #[test] fn test_points_pythagoras() { use super::testutils::*; let mut rng = 0; for _ in 0..128 { rng = poor_rng(rng); let pos = random_positions(rng as usize); assert_eq!(scalar::objects_close(&pos), objects_close(&pos)); } } } pub mod scalar { use super::definitions::*; fn sub_and_square(xy: Point2D, player_pos: Point2D) -> Point2D { let xy = [xy[0] - player_pos[0], xy[1] - player_pos[1]]; return [xy[0] * xy[0], xy[1] * xy[1]]; } fn is_close(xy: &Point2D) -> bool { let squared = sub_and_square(*xy, PLAYER_POS); let distance_squared = squared[0] + squared[1]; return distance_squared < MAX_DISTANCE * MAX_DISTANCE; } pub fn objects_close(x: &[Point2D]) -> usize { x.iter().copied().filter(is_close).count() } } pub mod testutils { use super::definitions::*; pub fn poor_rng(x: u16) -> u16 { let x = x ^ 0xC0DE; (x >> 3) | (x << 1) } pub fn random_positions(n: usize) -> Vec { let mut vec = Vec::with_capacity(n); let mut rng = 0; let mut pos: Point2D = Default::default(); for _ in 0..n { for i in 0..2 { rng = poor_rng(rng); pos[i] = rng as f32 / 256.0; } vec.push(pos); } return vec; } } #[cfg(feature = "nightly")] #[cfg(test)] mod benches { const N: usize = 1 << 20; extern crate test; use super::{definitions::*, testutils::*}; use test::{black_box, Bencher}; #[bench] fn bench_scalar_objects_close(b: &mut Bencher) { use super::scalar::*; let pos = random_positions(N); b.iter(|| { let mut x = black_box(0); x += objects_close(&pos); let _n = black_box(x); }); } #[cfg(target_feature = "sse")] #[bench] fn bench_sse_objects_close(b: &mut Bencher) { use super::sse::*; let pos = random_positions(N); b.iter(|| { let mut x = black_box(0); x += objects_close(&pos); let _n = black_box(x); }); } #[cfg(target_feature = "avx")] #[bench] fn bench_avx_objects_close(b: &mut Bencher) { use super::avx::*; let pos = random_positions(N); b.iter(|| { let mut x = black_box(0); x += objects_close(&pos); let _n = black_box(x); }); } } safe_arch-0.7.1/tests/integration/rdrand_tests.rs000066400000000000000000000010241445526200400221600ustar00rootroot00000000000000use super::*; // Note(Lokathor): It's technically possible, and valid, that these could fail // when run just once. However, if they fail across multiple test runs then // *that* is when we have a problem. #[test] fn test_rdrand_u16() { let mut val = 0_u16; let _it_worked = rdrand_u16(&mut val); } #[test] fn test_rdrand_u32() { let mut val = 0_u32; let _it_worked = rdrand_u32(&mut val); } #[test] #[cfg(target_arch = "x86_64")] fn test_rdrand_u64() { let mut val = 0_u64; let _it_worked = rdrand_u64(&mut val); } safe_arch-0.7.1/tests/integration/rdseed_tests.rs000066400000000000000000000010241445526200400221540ustar00rootroot00000000000000use super::*; // Note(Lokathor): It's technically possible, and valid, that these could fail // when run just once. However, if they fail across multiple test runs then // *that* is when we have a problem. 
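// (Illustrative note, not from the original file: because a single hardware draw can transiently fail, real callers usually retry `rdseed_u32(&mut val)` a few times in a loop and only give up if every attempt reports failure, rather than trusting one attempt.)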
#[test] fn test_rdseed_u16() { let mut val = 0_u16; let _it_worked = rdseed_u16(&mut val); } #[test] fn test_rdseed_u32() { let mut val = 0_u32; let _it_worked = rdseed_u32(&mut val); } #[test] #[cfg(target_arch = "x86_64")] fn test_rdseed_u64() { let mut val = 0_u64; let _it_worked = rdseed_u64(&mut val); } safe_arch-0.7.1/tests/integration/sse2_tests.rs000066400000000000000000000000151445526200400215610ustar00rootroot00000000000000use super::*;safe_arch-0.7.1/tests/integration/sse3_tests.rs000066400000000000000000000036031445526200400215700ustar00rootroot00000000000000use super::*; #[test] fn test_addsub_m128d() { let a = m128d::from_array([10.0, 50.0]); let b = m128d::from_array([100.0, 500.0]); let c = addsub_m128d(a, b).to_array(); assert_eq!(c, [-90.0, 550.0]); } #[test] fn test_addsub_m128() { let a = m128::from_array([10.0, 20.0, 30.0, 40.0]); let b = m128::from_array([100.0, 200.0, 300.0, 400.0]); let c = addsub_m128(a, b).to_array(); assert_eq!(c, [-90.0, 220.0, -270.0, 440.0]); } #[test] fn test_add_horizontal_m128d() { let a = m128d::from_array([10.0, 50.0]); let b = m128d::from_array([100.0, 500.0]); let c = add_horizontal_m128d(a, b).to_array(); assert_eq!(c, [60.0, 600.0]); } #[test] fn test_add_horizontal_m128() { let a = m128::from_array([10.0, 20.0, 30.0, 40.0]); let b = m128::from_array([100.0, 200.0, 300.0, 400.0]); let c = add_horizontal_m128(a, b).to_array(); assert_eq!(c, [30.0, 70.0, 300.0, 700.0]); } #[test] fn test_sub_horizontal_m128d() { let a = m128d::from_array([10.0, 50.0]); let b = m128d::from_array([100.0, 500.0]); let c = sub_horizontal_m128d(a, b).to_array(); assert_eq!(c, [-40.0, -400.0]); } #[test] fn test_sub_horizontal_m128() { let a = m128::from_array([10.0, 20.0, 30.0, 45.0]); let b = m128::from_array([100.0, 200.0, 300.0, 450.0]); let c = sub_horizontal_m128(a, b).to_array(); assert_eq!(c, [-10.0, -15.0, -100.0, -150.0]); } #[test] fn test_duplicate_low_lane_m128d_s() { let a = m128d::from_array([1.0, 2.0]); let b = duplicate_low_lane_m128d_s(a); assert_eq!(b.to_array(), [1.0, 1.0]); } #[test] fn test_duplicate_odd_lanes_m128() { let a = m128::from_array([0.0, 1.0, 2.0, 3.0]); let b = duplicate_odd_lanes_m128(a); assert_eq!(b.to_array(), [1.0, 1.0, 3.0, 3.0]); } #[test] fn test_duplicate_even_lanes_m128() { let a = m128::from_array([0.0, 1.0, 2.0, 3.0]); let b = duplicate_even_lanes_m128(a); assert_eq!(b.to_array(), [0.0, 0.0, 2.0, 2.0]); } safe_arch-0.7.1/tests/integration/sse4_1_tests.rs000066400000000000000000000201611445526200400220070ustar00rootroot00000000000000use super::*; #[test] fn test_blend_imm_i16_m128i() { let a = m128i::from([0_i16, 1, 2, 3, 4, 5, 6, 7]); let b = m128i::from([0_i16, -1, -2, -3, -4, -5, -6, -7]); // let c: [i16; 8] = blend_imm_i16_m128i::<0b1111_0110>(a, b).into(); assert_eq!(c, [0_i16, -1, -2, 3, -4, -5, -6, -7]); } #[test] fn test_blend_imm_m128d() { let a = m128d::from_array([0.0, 1.0]); let b = m128d::from_array([2.0, 3.0]); let c = blend_imm_m128d::<0b10>(a, b).to_array(); assert_eq!(c, [0.0, 3.0]); } #[test] fn test_blend_imm_m128() { let a = m128::from_array([0.0, 1.0, 2.0, 3.0]); let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); let c = blend_imm_m128::<0b0110>(a, b).to_array(); assert_eq!(c, [0.0, 5.0, 6.0, 3.0]); } #[test] fn test_blend_varying_i8_m128i() { let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); let b = m128i::from([ 0_i8, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, ]); let mask = m128i::from([0_i8, -1, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, 
0]); let c: [i8; 16] = blend_varying_i8_m128i(a, b, mask).into(); assert_eq!(c, [0, -1, -2, 3, 4, 5, -6, -7, -8, 9, 10, 11, -12, -13, -14, 15]); } #[test] fn test_blend_varying_m128d() { let a = m128d::from_array([0.0, 1.0]); let b = m128d::from_array([2.0, 3.0]); let mask = m128d::from_array([-1.0, 0.0]); let c = blend_varying_m128d(a, b, mask).to_array(); assert_eq!(c, [2.0, 1.0]); } #[test] fn test_blend_varying_m128() { let a = m128::from_array([0.0, 1.0, 2.0, 3.0]); let b = m128::from_array([4.0, 5.0, 6.0, 7.0]); let mask = m128::from_array([-1.0, 0.0, -1.0, 0.0]); let c = blend_varying_m128(a, b, mask).to_array(); assert_eq!(c, [4.0, 1.0, 6.0, 3.0]); } #[test] fn test_ceil_m128d() { let a = m128d::from_array([-0.1, 1.8]); assert_eq!(ceil_m128d(a).to_array(), [0.0, 2.0]); } #[test] fn test_ceil_m128() { let a = m128::from_array([-0.1, 1.8, 2.5, 3.0]); assert_eq!(ceil_m128(a).to_array(), [0.0, 2.0, 3.0, 3.0]); } #[test] fn test_ceil_m128d_s() { let a = m128d::from_array([-0.1, 1.8]); let b = m128d::from_array([2.5, 3.0]); assert_eq!(ceil_m128d_s(a, b).to_array(), [3.0, 1.8]); } #[test] fn test_ceil_m128_s() { let a = m128::from_array([-0.1, 1.8, 5.0, 6.0]); let b = m128::from_array([2.5, 3.0, 10.0, 20.0]); assert_eq!(ceil_m128_s(a, b).to_array(), [3.0, 1.8, 5.0, 6.0]); } #[test] fn test_cmp_eq_mask_i64_m128i() { let a = m128i::from([5_i64, 6_i64]); let b = m128i::from([5_i64, 7_i64]); let c: [i64; 2] = cmp_eq_mask_i64_m128i(a, b).into(); assert_eq!(c, [-1_i64, 0]); } #[test] fn test_convert_to_i32_m128i_from_lower4_i16_m128i() { let a = m128i::from([0_i16, -1, 2, -3, 4, 5, 6, 7]); let c: [i32; 4] = convert_to_i32_m128i_from_lower4_i16_m128i(a).into(); assert_eq!(c, [0, -1, 2, -3]); } #[test] fn test_convert_to_i64_m128i_from_lower2_i16_m128i() { let a = m128i::from([0_i16, -1, 2, -3, 4, 5, 6, 7]); let c: [i64; 2] = convert_to_i64_m128i_from_lower2_i16_m128i(a).into(); assert_eq!(c, [0, -1]); } #[test] fn test_convert_to_i64_m128i_from_lower2_i32_m128i() { let a = m128i::from([0, -1, 2, -3]); let c: [i64; 2] = convert_to_i64_m128i_from_lower2_i32_m128i(a).into(); assert_eq!(c, [0, -1]); } #[test] fn test_convert_to_i16_m128i_from_lower8_i8_m128i() { let a = m128i::from([0_i8, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15]); let c: [i16; 8] = convert_to_i16_m128i_from_lower8_i8_m128i(a).into(); assert_eq!(c, [0_i16, -1, 2, -3, 4, -5, 6, -7]); } #[test] fn test_convert_to_i32_m128i_from_lower4_i8_m128i() { let a = m128i::from([0_i8, -1, 2, -3, 4, -5, 6, -7, 8, 9, 10, 11, 12, 13, 14, 15]); let c: [i32; 4] = convert_to_i32_m128i_from_lower4_i8_m128i(a).into(); assert_eq!(c, [0, -1, 2, -3]); } #[test] fn test_convert_to_u32_m128i_from_lower4_u16_m128i() { let a = m128i::from([u16::MAX, 1, 2, 3, 4, 5, 6, 7]); let c: [u32; 4] = convert_to_u32_m128i_from_lower4_u16_m128i(a).into(); assert_eq!(c, [u16::MAX as u32, 1, 2, 3]); } #[test] fn test_convert_to_u64_m128i_from_lower2_u16_m128i() { let a = m128i::from([u16::MAX, 1, 2, 3, 4, 5, 6, 7]); let c: [u64; 2] = convert_to_u64_m128i_from_lower2_u16_m128i(a).into(); assert_eq!(c, [u16::MAX as u64, 1]); } #[test] fn test_convert_to_u64_m128i_from_lower2_u32_m128i() { let a = m128i::from([u32::MAX, 1, 2, 3]); let c: [u64; 2] = convert_to_u64_m128i_from_lower2_u32_m128i(a).into(); assert_eq!(c, [u32::MAX as u64, 1]); } #[test] fn test_convert_to_u16_m128i_from_lower8_u8_m128i() { let a = m128i::from([u8::MAX, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); let c: [u16; 8] = convert_to_u16_m128i_from_lower8_u8_m128i(a).into(); assert_eq!(c, [u8::MAX as
u16, 1, 2, 3, 4, 5, 6, 7]); } #[test] fn test_convert_to_u32_m128i_from_lower4_u8_m128i() { let a = m128i::from([u8::MAX, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); let c: [u32; 4] = convert_to_u32_m128i_from_lower4_u8_m128i(a).into(); assert_eq!(c, [u8::MAX as u32, 1, 2, 3]); } #[test] fn test_convert_to_u64_m128i_from_lower2_u8_m128i() { let a = m128i::from([u8::MAX, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); let c: [u64; 2] = convert_to_u64_m128i_from_lower2_u8_m128i(a).into(); assert_eq!(c, [u8::MAX as u64, 1]); } #[test] fn test_dot_product_m128d() { let a = m128d::from_array([1.0, 2.0]); let b = m128d::from_array([3.0, 4.0]); // let c = dot_product_m128d::<0b0000_0011>(a, b).to_array(); assert_eq!(c, [0.0, 0.0]); // no mul let c = dot_product_m128d::<0b0001_0011>(a, b).to_array(); assert_eq!(c, [3.0, 3.0]); // mul lane 0 (1 * 3) let c = dot_product_m128d::<0b0010_0011>(a, b).to_array(); assert_eq!(c, [8.0, 8.0]); // mul lane 1 (2 * 4) let c = dot_product_m128d::<0b0011_0011>(a, b).to_array(); assert_eq!(c, [11.0, 11.0]); // mul both lanes (and summed in the next step) // After here we have two temp lanes, which get added to form `sum`. let c = dot_product_m128d::<0b0011_0000>(a, b).to_array(); assert_eq!(c, [0.0, 0.0]); // never use sum let c = dot_product_m128d::<0b0011_0001>(a, b).to_array(); assert_eq!(c, [11.0, 0.0]); // sum in output lane 0 let c = dot_product_m128d::<0b0011_0010>(a, b).to_array(); assert_eq!(c, [0.0, 11.0]); // sum in output lane 1 let c = dot_product_m128d::<0b0011_0011>(a, b).to_array(); assert_eq!(c, [11.0, 11.0]); // sum in both output lanes. } #[test] fn test_dot_product_m128() { let a = m128::from_array([1.0, 2.0, 3.0, 4.0]); let b = m128::from_array([5.0, 6.0, 7.0, 8.0]); // let c = dot_product_m128::<0b0000_1111>(a, b).to_array(); assert_eq!(c, [0.0, 0.0, 0.0, 0.0]); // no mul let c = dot_product_m128::<0b0001_1111>(a, b).to_array(); assert_eq!(c, [5.0, 5.0, 5.0, 5.0]); // mul temp lane 0 (1 * 5) let c = dot_product_m128::<0b0010_1111>(a, b).to_array(); assert_eq!(c, [12.0, 12.0, 12.0, 12.0]); // mul temp lane 1 (2 * 6) let c = dot_product_m128::<0b0100_1111>(a, b).to_array(); assert_eq!(c, [21.0, 21.0, 21.0, 21.0]); // mul temp lane 2 (3 * 7) let c = dot_product_m128::<0b1000_1111>(a, b).to_array(); assert_eq!(c, [32.0, 32.0, 32.0, 32.0]); // mul temp lane 3 (4 * 8) let c = dot_product_m128::<0b1111_1111>(a, b).to_array(); assert_eq!(c, [70.0, 70.0, 70.0, 70.0]); // mul all lanes (and summed in the next step) // After here we have four temp lanes, which get added to form `sum`.
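// (Reading the immediate as two nibbles, which is the pattern the assertions above and below demonstrate:
// the high nibble picks which lanes are multiplied before summing, and the low nibble picks which output
// lanes receive that sum, with unselected output lanes zeroed. With all four products enabled the sum is
// 1*5 + 2*6 + 3*7 + 4*8 = 70.)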
let c = dot_product_m128::<0b1111_0000>(a, b).to_array(); assert_eq!(c, [0.0, 0.0, 0.0, 0.0]); // never use sum let c = dot_product_m128::<0b1111_0001>(a, b).to_array(); assert_eq!(c, [70.0, 0.0, 0.0, 0.0]); // sum in output lane 0 let c = dot_product_m128::<0b1111_0010>(a, b).to_array(); assert_eq!(c, [0.0, 70.0, 0.0, 0.0]); // sum in output lane 1 let c = dot_product_m128::<0b1111_0100>(a, b).to_array(); assert_eq!(c, [0.0, 0.0, 70.0, 0.0]); // sum in output lane 2 let c = dot_product_m128::<0b1111_1000>(a, b).to_array(); assert_eq!(c, [0.0, 0.0, 0.0, 70.0]); // sum in output lane 3 let c = dot_product_m128::<0b1111_1111>(a, b).to_array(); assert_eq!(c, [70.0, 70.0, 70.0, 70.0]); // sum in all output lanes } #[test] fn test_extract_i32_imm_m128i() { let a = m128i::from([5, 6, 7, 8]); assert_eq!(extract_i32_imm_m128i::<1>(a), 6); } safe_arch-0.7.1/tests/integration/sse4_2_tests.rs000066400000000000000000000110111445526200400220020ustar00rootroot00000000000000use super::*; #[test] fn test_cmp_gt_mask_i64_m128i() { let a = m128i::from([1_i64, 3]); let b = m128i::from([0_i64, 3]); let c: [i64; 2] = cmp_gt_mask_i64_m128i(a, b).into(); assert_eq!(c, [-1_i64, 0]); } #[test] fn test_crc32_u8() { assert_eq!(crc32_u8(u32::MAX, u8::MAX), 16777215_u32); } #[test] fn test_crc32_u16() { assert_eq!(crc32_u16(u32::MAX, u16::MAX), 65535_u32); } #[test] fn test_crc32_u32() { assert_eq!(crc32_u32(u32::MAX, u32::MAX), 0_u32); } #[test] #[cfg(target_arch = "x86_64")] fn test_crc32_u64() { assert_eq!(crc32_u64(u64::MAX, u64::MAX), 3080238136_u64); } #[test] #[rustfmt::skip] #[cfg(target_arch = "x86_64")] fn test_search_implicit_str_for_index() { // Eq Any let needle: m128i = m128i::from(*b"e_______________"); let hay: m128i = m128i::from(*b"some test words."); assert_eq!(3, search_implicit_str_for_index::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_FIRST_MATCH}>(needle, hay)); assert_eq!(6, search_implicit_str_for_index::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_LAST_MATCH}>(needle, hay)); assert_eq!(3, search_explicit_str_for_index::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_FIRST_MATCH}>(needle, 1, hay, 16)); assert_eq!(6, search_explicit_str_for_index::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_LAST_MATCH}>(needle, 1, hay, 16)); // more than one needle character will match any of them, though we // don't get info about _which_ needle character matched. 
let needle: m128i = m128i::from(*b"et\0_____________"); assert_eq!(3, search_implicit_str_for_index::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_FIRST_MATCH}>(needle, hay)); assert_eq!(8, search_implicit_str_for_index::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_LAST_MATCH}>(needle, hay)); // Cmp Ranges let hay: m128i = m128i::from(*b"some test words."); let needle: m128i = m128i::from(*b"vz\0_____________"); assert_eq!(10, search_implicit_str_for_index::<{STR_CMP_U8|STR_CMP_RANGES|STR_CMP_FIRST_MATCH}>(needle, hay)); // Cmp Eq Each let hay: m128i = m128i::from(*b"some test words."); let needle: m128i = m128i::from(*b"_____test_______"); assert_eq!(5, search_implicit_str_for_index::<{STR_CMP_U8|STR_CMP_EQ_EACH|STR_CMP_FIRST_MATCH}>(needle, hay)); assert_eq!(8, search_implicit_str_for_index::<{STR_CMP_U8|STR_CMP_EQ_EACH|STR_CMP_LAST_MATCH}>(needle, hay)); // Cmp Eq Ordered let hay: m128i = m128i::from(*b"some test words."); let needle: m128i = m128i::from(*b"words\0__________"); assert_eq!(10, search_implicit_str_for_index::<{STR_CMP_U8|STR_CMP_EQ_ORDERED|STR_CMP_FIRST_MATCH}>(needle, hay)); } #[test] #[rustfmt::skip] #[cfg(target_arch = "x86_64")] fn test_search_implicit_str_for_mask() { // EqAny let hay: m128i = m128i::from(*b"some test words."); // explicit needle length let needle: m128i = m128i::from(*b"e_______________"); let i: u128 = search_explicit_str_for_mask::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_BIT_MASK}>(needle, 1, hay, 16).into(); assert_eq!(i, 0b0000000001001000); let i: [i8; 16] = search_explicit_str_for_mask::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_UNIT_MASK}>(needle, 1, hay, 16).into(); assert_eq!(i, [0, 0, 0, -1, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0]); // implicit needle length let needle: m128i = m128i::from(*b"e\0______________"); let i: u128 = search_implicit_str_for_mask::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_BIT_MASK}>(needle, hay).into(); assert_eq!(i, 0b0000000001001000); // more than one needle character will match any of them, though we // don't get info about _which_ needle character matched. let needle: m128i = m128i::from(*b"et\0_____________"); let i: u128 = search_implicit_str_for_mask::<{STR_CMP_U8|STR_CMP_EQ_ANY|STR_CMP_BIT_MASK}>(needle, hay).into(); assert_eq!(i, 0b0000000101101000); // Cmp Ranges let hay: m128i = m128i::from(*b"some test words."); let needle: m128i = m128i::from(*b"am\0_____________"); let i: u128 = search_implicit_str_for_mask::<{STR_CMP_U8|STR_CMP_RANGES|STR_CMP_BIT_MASK}>(needle, hay).into(); assert_eq!(i, 0b0010000001001100); //Cmp Eq Each let hay: m128i = m128i::from(*b"some test words."); let needle: m128i = m128i::from(*b"_____test_______"); let i: u128 = search_implicit_str_for_mask::<{STR_CMP_U8|STR_CMP_EQ_EACH|STR_CMP_BIT_MASK}>(needle, hay).into(); assert_eq!(i, 0b0000000111100000); // Cmp Eq Ordered let hay: m128i = m128i::from(*b"some test words."); let needle: m128i = m128i::from(*b"words\0__________"); let i: u128 = search_implicit_str_for_mask::<{STR_CMP_U8|STR_CMP_EQ_ORDERED|STR_CMP_BIT_MASK}>(needle, hay).into(); assert_eq!(i, 0b00000010000000000); // one bit at the start of the match } safe_arch-0.7.1/tests/integration/ssse3_tests.rs000066400000000000000000000127471445526200400217640ustar00rootroot00000000000000use super::*; #[test] fn atoi_test() { fn atoi(x: [u8; 16]) -> u64 { let ascii_zero = set_splat_i8_m128i(b'0' as i8); let x: m128i = x.into(); let x = sub_i8_m128i(x, ascii_zero); let tens = set_splat_i16_m128i(1 << 8 | 10); let x = mul_u8i8_add_horizontal_saturating_m128i(x, tens); /* eeee macarena! 
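(a rough walk-through of the remaining steps, assuming the usual behavior of the maddubs/madd/packus
style intrinsics wrapped here: after subtracting b'0' every byte lane holds one decimal digit, and the
multiply-add above folds each adjacent pair of digit bytes into a 16-bit value of earlier_digit * 10 +
later_digit; the next multiply-add with (100, 1) folds those into four 32-bit four-digit chunks; packing
to u16 and one more multiply-add with (10000, 1) on the low half yields two eight-digit chunks, which the
final expression combines as x[0] * 100_000_000 + x[1].)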
*/ let tens = set_splat_i32_m128i(1 << 16 | 100); let x = mul_i16_horizontal_add_m128i(x, tens); let tens = set_i16_m128i(0, 0, 0, 0, 1, 10000, 1, 10000); let x = pack_i32_to_u16_m128i(x, x); let x = mul_i16_horizontal_add_m128i(x, tens); let x: [u32; 4] = x.into(); x[1] as u64 + x[0] as u64 * 100000000 } assert_eq!(atoi(*b"1234567812345678"), 1234567812345678); assert_eq!(atoi(*b"0000000000000000"), 0000000000000000); assert_eq!(atoi(*b"1982379879823749"), 1982379879823749); } #[test] fn test_abs_i8_m128i() { let a = m128i::from([0_i8, -1, 2, -3, 4, -5, 6, -7, -8, 9, -10, 11, -12, 13, -14, -128]); let c: [i8; 16] = abs_i8_m128i(a).into(); assert_eq!(c, [0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -128]); } #[test] fn test_abs_i16_m128i() { let a = m128i::from([0_i16, 1, 2, 3, 4, 5, 6, i16::MIN]); let c: [i16; 8] = abs_i16_m128i(a).into(); assert_eq!(c, [0_i16, 1, 2, 3, 4, 5, 6, i16::MIN]); } #[test] fn test_abs_i32_m128i() { let a = m128i::from([0, -1, 2, i32::MIN]); let c: [i32; 4] = abs_i32_m128i(a).into(); assert_eq!(c, [0, 1, 2, i32::MIN]); } #[test] fn test_combined_byte_shr_imm_m128i() { let a = m128i::from([0_i8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]); let b = m128i::from([16_i8, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31]); // `a` bytes come in to the high indexes because these are LE bytes. let c: [i8; 16] = combined_byte_shr_imm_m128i::<3>(a, b).into(); assert_eq!(c, [19_i8, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 1, 2]); // If you feed the same register to both sides it becomes a rotate let c: [i8; 16] = combined_byte_shr_imm_m128i::<3>(a, a).into(); assert_eq!(c, [3_i8, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,]); } #[test] fn test_add_horizontal_i16_m128i() { let a = m128i::from([1_i16, 2, 3, 4, -1, -2, -3, -4]); let b = m128i::from([5_i16, 6, 7, 8, -15, -26, -37, 48]); let c: [i16; 8] = add_horizontal_i16_m128i(a, b).into(); assert_eq!(c, [3, 7, -3, -7, 11, 15, -41, 11]); } #[test] fn test_add_horizontal_i32_m128i() { let a = m128i::from([1, 2, 3, 4]); let b = m128i::from([5, 6, 7, 8]); let c: [i32; 4] = add_horizontal_i32_m128i(a, b).into(); assert_eq!(c, [3, 7, 11, 15]); } #[test] fn test_add_horizontal_saturating_i16_m128i() { let a = m128i::from([i16::MAX, i16::MAX, 3, 4, -1, -2, -3, -4]); let b = m128i::from([5_i16, 6, 7, 8, -15, -26, -37, 48]); let c: [i16; 8] = add_horizontal_saturating_i16_m128i(a, b).into(); assert_eq!(c, [i16::MAX, 7, -3, -7, 11, 15, -41, 11]); } #[test] fn test_sub_horizontal_i16_m128i() { let a = m128i::from([1_i16, 29, 3, 64, -18, -23, -73, -14]); let b = m128i::from([50_i16, 76, 72, 89, -15, -26, -37, 48]); let c: [i16; 8] = sub_horizontal_i16_m128i(a, b).into(); assert_eq!(c, [-28, -61, 5, -59, -26, -17, 11, -85]); } #[test] fn test_sub_horizontal_i32_m128i() { let a = m128i::from([1, 29, 3, 42]); let b = m128i::from([5, 96, 7, 84]); let c: [i32; 4] = sub_horizontal_i32_m128i(a, b).into(); assert_eq!(c, [-28, -39, -91, -77]); } #[test] fn test_sub_horizontal_saturating_i16_m128i() { let a = m128i::from([i16::MIN, 1, 3, 49, -1, -27, -3, -412]); let b = m128i::from([5_i16, 699, 7, 877, -15, -2664, -37, 4008]); let c: [i16; 8] = sub_horizontal_saturating_i16_m128i(a, b).into(); assert_eq!(c, [i16::MIN, -46, 26, 409, -694, -870, 2649, -4045]); } #[test] fn test_mul_u8i8_add_horizontal_saturating_m128i() { let a = m128i::from([255_u8, 255, 0, 0, 255, 255, 1, 1, 8, 9, 10, 11, 12, 13, 14, 15]); let b = m128i::from([127_i8, 127, 0, 0, -127, -127, 1, 1, 24, 25, 26, 27, 28, 29, 30, 31]); let c: 
[i16; 8] = mul_u8i8_add_horizontal_saturating_m128i(a, b).into(); assert_eq!(c, [i16::MAX, 0, i16::MIN, 2, 417, 557, 713, 885]); } #[test] fn test_mul_i16_scale_round_m128i() { let a = m128i::from([0_i16, 100, 200, 300, 400, 500, 600, 700]); let b = m128i::from([800_i16, 900, 1000, 1100, 1200, 1300, 1400, 1500]); let c: [i16; 8] = mul_i16_scale_round_m128i(a, b).into(); assert_eq!(c, [0, 3, 6, 10, 15, 20, 26, 32]); } #[test] fn test_shuffle_av_i8z_all_m128i() { let a = m128i::from([70_i8, 1, 2, 3, 4, 5, 6, 7, 8, 99, 100, 11, 12, 13, 14, 55]); let v = m128i::from([-1_i8, 5, 4, 1, 3, 0, 9, 10, 2, 14, 6, 7, 15, 12, 13, 8]); let c: [i8; 16] = shuffle_av_i8z_all_m128i(a, v).into(); assert_eq!(c, [0_i8, 5, 4, 1, 3, 70, 99, 100, 2, 14, 6, 7, 55, 12, 13, 8]); } #[test] fn test_sign_apply_i8_m128i() { let a = m128i::from([0_i8, 1, -2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, -15]); let b = m128i::from([-1_i8, 1, 1, -1, -1, 1, 1, 1, 1, 0, 0, -1, -1, 0, 0, 1]); let c: [i8; 16] = sign_apply_i8_m128i(a, b).into(); assert_eq!(c, [0_i8, 1, -2, -3, -4, 5, 6, 7, 8, 0, 0, -11, -12, 0, 0, -15]); } #[test] fn test_sign_apply_i16_m128i() { let a = m128i::from([1_i16, 2, -3, 4, 5, 6, 7, 8]); let b = m128i::from([5_i16, -6, 7, 0, 1, 1, 0, 1]); let c: [i16; 8] = sign_apply_i16_m128i(a, b).into(); assert_eq!(c, [1_i16, -2, -3, 0, 5, 6, 0, 8]); } #[test] fn test_sign_apply_i32_m128i() { let a = m128i::from([1, 2, -3, 4]); let b = m128i::from([5, -6, 7, 0]); let c: [i32; 4] = sign_apply_i32_m128i(a, b).into(); assert_eq!(c, [1, -2, -3, 0]); } safe_arch-0.7.1/tests/integration/xor_hash.rs.bak000066400000000000000000000034171445526200400220430ustar00rootroot00000000000000#![cfg(target_feature = "avx")] #![cfg(feature = "bytemuck")] #![cfg_attr(feature = "nightly", feature(test))] use bytemuck; #[allow(unused_must_use)] #[allow(unused_variables)] use safe_arch::*; use std::ops::BitXor; fn poor_rng(x: u64) -> u64 { let x = x ^ 0xDEADBEEFFEEBDAED; (x >> 3) | (x << 1) } fn random_bytes(n: usize) -> Vec<u64> { let mut vec = Vec::with_capacity(n); let mut rng = 0; for _ in 0..n { rng = poor_rng(rng); vec.push(rng); } return vec; } fn simple_xor_hash(s: &[u64]) -> u64 { s.iter().copied().fold(0, BitXor::bitxor) } fn simd_xor_hash(s: &[u64]) -> u64 { let mut ret = 0; let (begin, meat, end) = bytemuck::pod_align_to(s); ret ^= begin.iter().copied().fold(0, BitXor::bitxor); let zero: m256i = [0u64; 4].into(); let x: [u64; 4] = meat.iter().copied().fold(zero, BitXor::bitxor).into(); ret ^= x.iter().copied().fold(0, BitXor::bitxor); ret ^= end.iter().copied().fold(0, BitXor::bitxor); return ret; } #[cfg(test)] mod tests { use super::*; #[test] fn test_xor_hash() { let bytes = random_bytes(1024 * 1024 * 16 + 1); let a = simple_xor_hash(&bytes); let b = simd_xor_hash(&bytes); assert_eq!(a, b); } } #[cfg(feature = "nightly")] #[cfg(test)] mod benches { const BYTES: usize = 1 << 20; extern crate test; use super::*; use test::{black_box, Bencher}; #[bench] fn bench_xor_hash_simd(b: &mut Bencher) { let bytes = random_bytes(BYTES); b.iter(|| { let mut x = black_box(0); x ^= simd_xor_hash(&bytes); let _n = black_box(x); }); } #[bench] fn bench_xor_hash_simple(b: &mut Bencher) { let bytes = random_bytes(BYTES); b.iter(|| { let mut x = black_box(0); x ^= simple_xor_hash(&bytes); let _n = black_box(x); }); } } safe_arch-0.7.1/tmp.txt000066400000000000000000000036041445526200400150000ustar00rootroot00000000000000vabsq_f32 vabsq_f64 vabsq_s16 vabsq_s32 vabsq_s8 vaddq_f32 vaddq_f64 vaddq_s16 vaddq_s32 vaddq_s64 vaddq_s8 vaddq_u16 vaddq_u64 vaddq_u8
vaddvq_u32 vandq_s16 vandq_s32 vandq_s64 vandq_s8 vandq_u16 vandq_u32 vandq_u64 vandq_u8 vbslq_s16 vbslq_s32 vbslq_s64 vbslq_s8 vbslq_u16 vbslq_u32 vbslq_u64 vbslq_u8 vceqq_f32 vceqq_f64 vceqq_s16 vceqq_s32 vceqq_s64 vceqq_s8 vceqq_u16 vceqq_u32 vceqq_u64 vceqq_u8 vcgeq_f32 vcgeq_f64 vcgtq_f32 vcgtq_f64 vcgtq_s16 vcgtq_s32 vcgtq_s64 vcgtq_s8 vcgtq_u64 vcleq_f32 vcleq_f64 vcltq_f32 vcltq_f64 vcltq_s16 vcltq_s32 vcltq_s64 vcltq_s8 vcltq_u32 vcvtnq_s32_f32 vcvtq_f32_s32 vcvtq_s32_f32 vdivq_f32 vdivq_f64 vdupq_n_f32 vdupq_n_f64 vdupq_n_s64 vdupq_n_u32 vdupq_n_u8 veorq_s16 veorq_s32 veorq_s64 veorq_s8 veorq_u16 veorq_u32 veorq_u64 veorq_u8 vgetq_lane_f64 vgetq_lane_s64 vgetq_lane_u64 vld1q_s32 vmaxnmq_f32 vmaxnmq_f64 vmaxq_f32 vmaxq_f64 vmaxq_s16 vmaxq_s8 vmaxq_u16 vmaxq_u32 vmaxq_u8 vminnmq_f32 vminnmq_f64 vminq_f32 vminq_f64 vminq_s16 vminq_s32 vminq_s8 vminq_u16 vminq_u32 vminq_u8 vminvq_u16 vminvq_u32 vminvq_u8 vmovq_n_s16 vmovq_n_s32 vmovq_n_s64 vmulq_f32 vmulq_f64 vmulq_s16 vmulq_s32 vmulq_u16 vmulq_u32 vmvnq_u32 vorrq_s16 vorrq_s32 vorrq_s64 vorrq_s8 vorrq_u16 vorrq_u32 vorrq_u64 vorrq_u8 vqaddq_s16 vqaddq_s8 vqaddq_u16 vqaddq_u8 vqrdmulhq_n_s16 vqrdmulhq_s16 vqsubq_s16 vqsubq_s8 vqsubq_u16 vqsubq_u8 vreinterpretq_f32_s32 vreinterpretq_f32_u32 vreinterpretq_f64_u32 vreinterpretq_f64_u64 vreinterpretq_s16_u16 vreinterpretq_s32_f32 vreinterpretq_s32_s16 vreinterpretq_s32_u32 vreinterpretq_s64_u64 vreinterpretq_s8_u8 vreinterpretq_u16_s16 vreinterpretq_u32_f32 vreinterpretq_u32_s32 vreinterpretq_u64_f64 vreinterpretq_u64_s64 vreinterpretq_u8_s8 vrndnq_f32 vshlq_s16 vshlq_s32 vshlq_s64 vshlq_u16 vshlq_u32 vshlq_u64 vshlq_u8 vsqrtq_f32 vsqrtq_f64 vsubq_f32 vsubq_f64 vsubq_s16 vsubq_s32 vsubq_s64 vsubq_s8 vsubq_u16 vsubq_u64 vsubq_u8 vtrnq_s16 vtrnq_s32 vtrq32