poly1305-0.8.0/.cargo_vcs_info.json0000644000000001460000000000100123360ustar { "git": { "sha1": "47a72962e8898fc70ab79d4ab40949efe61a23d2" }, "path_in_vcs": "poly1305" }poly1305-0.8.0/CHANGELOG.md000064400000000000000000000067310072674642500127750ustar 00000000000000# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## 0.8.0 (2022-07-31) ### Changed - Relax `zeroize` constraints ([#147]) - Upgrade to Rust 2021 edition ([#147]) - Use stable `aarch64_target_feature` ([#154]) - Bump `universal-hash` to v0.5 ([#155], [#162]) - Replace `armv8`/`force-soft` features with `cfg` attributes ([#159]) ### Removed - `armv8`/`force-soft` features ([#159]) [#147]: https://github.com/RustCrypto/universal-hashes/pull/147 [#154]: https://github.com/RustCrypto/universal-hashes/pull/154 [#155]: https://github.com/RustCrypto/universal-hashes/pull/155 [#159]: https://github.com/RustCrypto/universal-hashes/pull/159 [#162]: https://github.com/RustCrypto/universal-hashes/pull/162 ## 0.7.2 (2021-08-27) ### Changed - Bump `cpufeatures` dependency to v0.2 ([#136]) [#136]: https://github.com/RustCrypto/universal-hashes/pull/136 ## 0.7.1 (2021-07-20) ### Changed - Pin `zeroize` dependency to v1.3 ([#134]) [#134]: https://github.com/RustCrypto/universal-hashes/pull/134 ## 0.7.0 (2021-04-29) ### Changed - Use `ManuallyDrop` unions; MSRV 1.49+ ([#114]) - Use `cpufeatures` v0.1 crate release ([#116]) [#114]: https://github.com/RustCrypto/universal-hashes/pull/114 [#116]: https://github.com/RustCrypto/universal-hashes/pull/116 ## 0.6.2 (2020-12-09) ### Added - Runtime AVX2 detection ([#97]) [#97]: https://github.com/RustCrypto/universal-hashes/pull/97 ## 0.6.1 (2020-09-29) ### Added - AVX2 backend ([#49]) [#49]: https://github.com/RustCrypto/universal-hashes/pull/49 ## 0.6.0 (2020-06-06) ### Added - `Poly1305::compute_unpadded` for XSalsa20Poly1305 ([#55]) ### Changed - Bump `universal-hash` dependency to v0.4; MSRV 1.41 ([#52], [#57]) - Rename `result` methods to to `finalize` ([#56]) ### Fixed - Build with `zeroize` enabled ([#48]) [#57]: https://github.com/RustCrypto/universal-hashes/pull/57 [#56]: https://github.com/RustCrypto/universal-hashes/pull/56 [#55]: https://github.com/RustCrypto/universal-hashes/pull/55 [#52]: https://github.com/RustCrypto/universal-hashes/pull/52 [#48]: https://github.com/RustCrypto/universal-hashes/pull/48 ## 0.5.2 (2019-11-14) ### Changed - Upgrade to `zeroize` 1.0 ([#33]) [#33]: https://github.com/RustCrypto/universal-hashes/pull/33 ## 0.5.1 (2019-10-04) ### Added - Link to `chacha20poly1305` and `xsalsa20poly1305` crates from README.md ([#26]) [#26]: https://github.com/RustCrypto/universal-hashes/pull/26 ## 0.5.0 (2019-10-04) ### Changed - Upgrade to `universal-hash` crate v0.3 ([#22]) [#22]: https://github.com/RustCrypto/universal-hashes/pull/22 ## 0.4.1 (2019-10-01) ### Changed - Upgrade to `zeroize` v1.0.0-pre ([#19]) [#19]: https://github.com/RustCrypto/universal-hashes/pull/19 ## 0.4.0 (2019-09-29) ### Changed - Update to Rust 2018 edition ([#3]) - Use `UniversalHash` trait ([#5]) [#3]: https://github.com/RustCrypto/universal-hashes/pull/3 [#5]: https://github.com/RustCrypto/universal-hashes/pull/5 ## 0.3.0 (2019-08-26) ### Changed - Switch from `MacResult` to built-in `Tag` type ([RustCrypto/MACs#13]) [RustCrypto/MACs#13]: https://github.com/RustCrypto/MACs/pull/13 ## 0.2.0 (2019-08-19) 
### Added - `Poly1305::input_padded()` ([#16]) ### Changed - Change output to be a `MacResult` ([RustCrypto/MACs#16]) [RustCrypto/MACs#16]: https://github.com/RustCrypto/MACs/pull/16 ## 0.1.0 (2019-08-15) - Initial release poly1305-0.8.0/Cargo.toml0000644000000025530000000000100103400ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.56" name = "poly1305" version = "0.8.0" authors = ["RustCrypto Developers"] description = "The Poly1305 universal hash function and message authentication code" documentation = "https://docs.rs/poly1305" readme = "README.md" keywords = [ "crypto", "chacha20", "mac", "salsa20", "universal-hashing", ] categories = [ "cryptography", "no-std", ] license = "Apache-2.0 OR MIT" repository = "https://github.com/RustCrypto/universal-hashes" resolver = "2" [dependencies.opaque-debug] version = "0.3" [dependencies.universal-hash] version = "0.5" default-features = false [dependencies.zeroize] version = "1" optional = true default-features = false [dev-dependencies.hex-literal] version = "0.3" [features] std = ["universal-hash/std"] [target."cfg(any(target_arch = \"x86_64\", target_arch = \"x86\"))".dependencies.cpufeatures] version = "0.2" poly1305-0.8.0/Cargo.toml.orig000064400000000000000000000014720072674642500140500ustar 00000000000000[package] name = "poly1305" version = "0.8.0" authors = ["RustCrypto Developers"] license = "Apache-2.0 OR MIT" description = "The Poly1305 universal hash function and message authentication code" documentation = "https://docs.rs/poly1305" repository = "https://github.com/RustCrypto/universal-hashes" keywords = ["crypto", "chacha20", "mac", "salsa20", "universal-hashing"] categories = ["cryptography", "no-std"] readme = "README.md" rust-version = "1.56" edition = "2021" [dependencies] opaque-debug = "0.3" universal-hash = { version = "0.5", default-features = false } zeroize = { version = "1", optional = true, default-features = false } [target.'cfg(any(target_arch = "x86_64", target_arch = "x86"))'.dependencies] cpufeatures = "0.2" [dev-dependencies] hex-literal = "0.3" [features] std = ["universal-hash/std"] poly1305-0.8.0/LICENSE-APACHE000064400000000000000000000251410072674642500131040ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. poly1305-0.8.0/LICENSE-MIT000064400000000000000000000020560072674642500126140ustar 00000000000000Copyright (c) 2015-2019 RustCrypto Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. poly1305-0.8.0/README.md000064400000000000000000000054240072674642500124410ustar 00000000000000# RustCrypto: Poly1305 [![crate][crate-image]][crate-link] [![Docs][docs-image]][docs-link] ![Apache2/MIT licensed][license-image] ![Rust Version][rustc-image] [![Build Status][build-image]][build-link] [Poly1305][1] is a [universal hash function][2] which, when combined with a cipher, can be used as a [Message Authentication Code (MAC)][3]. In practice, Poly1305 is primarily combined with ciphers from the [Salsa20 Family][4] such as in [ChaCha20Poly1305][5] and [XSalsa20Poly1305][6] (a.k.a. NaCl `crypto_secretbox`). [Documentation][docs-link] ## Security Notes This crate has received one [security audit by NCC Group][7], with no significant findings. We would like to thank [MobileCoin][8] for funding the audit. NOTE: the audit predates the AVX2 backend, which has not yet been audited. All implementations contained in the crate are designed to execute in constant time, either by relying on hardware intrinsics (e.g. AVX2 on x86/x86_64), or using a portable implementation which is only constant time on processors which implement constant-time multiplication. It is not suitable for use on processors with a variable-time multiplication operation (e.g. short circuit on multiply-by-zero / multiply-by-one, such as certain 32-bit PowerPC CPUs and some non-ARM microcontrollers). ## License Licensed under either of: * [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0) * [MIT license](http://opensource.org/licenses/MIT) at your option. ### Contribution Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in the work by you, as defined in the Apache-2.0 license, shall be dual licensed as above, without any additional terms or conditions. 
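## Example

The following is a minimal sketch of computing a Poly1305 tag via the re-exported `universal-hash` traits (the same API the crate's benchmarks use). The all-zero key and short message are purely illustrative; in practice a Poly1305 key must be a one-time key derived from a cipher.

```rust
use poly1305::{
    universal_hash::{KeyInit, UniversalHash},
    Key, Poly1305,
};

fn main() {
    // Illustration only: a real Poly1305 key must be unique per message.
    let key = Key::default();
    let mut mac = Poly1305::new(&key);

    // Hash the message in 16-byte blocks, zero-padding the final partial block.
    mac.update_padded(b"example message");

    // Produce the 16-byte authenticator.
    let tag = mac.finalize();
    assert_eq!(tag.len(), 16);
}
```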
[//]: # (badges) [crate-image]: https://img.shields.io/crates/v/poly1305.svg [crate-link]: https://crates.io/crates/poly1305 [docs-image]: https://docs.rs/poly1305/badge.svg [docs-link]: https://docs.rs/poly1305/ [license-image]: https://img.shields.io/badge/license-Apache2.0/MIT-blue.svg [rustc-image]: https://img.shields.io/badge/rustc-1.56+-blue.svg [build-image]: https://github.com/RustCrypto/universal-hashes/workflows/poly1305/badge.svg?branch=master&event=push [build-link]: https://github.com/RustCrypto/universal-hashes/actions?query=workflow%3Apoly1305 [//]: # (footnotes) [1]: https://en.wikipedia.org/wiki/Poly1305 [2]: https://en.wikipedia.org/wiki/Universal_hashing [3]: https://en.wikipedia.org/wiki/Message_authentication_code [4]: https://cr.yp.to/snuffle/salsafamily-20071225.pdf [5]: https://github.com/RustCrypto/AEADs/tree/master/chacha20poly1305 [6]: https://github.com/RustCrypto/AEADs/tree/master/xsalsa20poly1305 [7]: https://research.nccgroup.com/2020/02/26/public-report-rustcrypto-aes-gcm-and-chacha20poly1305-implementation-review/ [8]: https://www.mobilecoin.com/ poly1305-0.8.0/benches/poly1305.rs000064400000000000000000000012230072674642500144240ustar 00000000000000#![feature(test)] extern crate test; use poly1305::{ universal_hash::{KeyInit, UniversalHash}, Poly1305, }; use test::Bencher; // TODO(tarcieri): move this into the `universal-hash` crate macro_rules! bench { ($name:ident, $bs:expr) => { #[bench] fn $name(b: &mut Bencher) { let key = Default::default(); let mut m = Poly1305::new(&key); let data = [0; $bs]; b.iter(|| { m.update_padded(&data); }); b.bytes = $bs; } }; } bench!(bench1_10, 10); bench!(bench2_100, 100); bench!(bench3_1000, 1000); bench!(bench3_10000, 10000); poly1305-0.8.0/fuzz/main.rs000064400000000000000000000003530072674642500134460ustar 00000000000000#[macro_use] extern crate afl; fn main() { fuzz!(|data: &[u8]| { // Use first 32 bytes of data as key. if data.len() >= 32 { poly1305::fuzz_avx2((&data[0..32]).into(), &data[32..]); } }); } poly1305-0.8.0/src/backend/autodetect.rs000064400000000000000000000052240072674642500160450ustar 00000000000000//! Autodetection support for AVX2 CPU intrinsics on x86 CPUs, with fallback //! to the "soft" backend when it's unavailable. 
use universal_hash::{consts::U16, crypto_common::BlockSizeUser, UniversalHash}; use crate::{backend, Block, Key, Tag}; use core::mem::ManuallyDrop; cpufeatures::new!(avx2_cpuid, "avx2"); pub struct State { inner: Inner, token: avx2_cpuid::InitToken, } union Inner { avx2: ManuallyDrop, soft: ManuallyDrop, } impl BlockSizeUser for State { type BlockSize = U16; } impl State { /// Initialize Poly1305 [`State`] with the given key #[inline] pub(crate) fn new(key: &Key) -> State { let (token, avx2_present) = avx2_cpuid::init_get(); let inner = if avx2_present { Inner { avx2: ManuallyDrop::new(backend::avx2::State::new(key)), } } else { Inner { soft: ManuallyDrop::new(backend::soft::State::new(key)), } }; Self { inner, token } } /// Compute a Poly1305 block #[inline] pub(crate) fn compute_block(&mut self, block: &Block, partial: bool) { if self.token.get() { unsafe { (*self.inner.avx2).compute_block(block, partial) } } else { unsafe { (*self.inner.soft).compute_block(block, partial) } } } } impl UniversalHash for State { fn update_with_backend( &mut self, f: impl universal_hash::UhfClosure, ) { if self.token.get() { unsafe { f.call(&mut *self.inner.avx2) } } else { unsafe { f.call(&mut *self.inner.soft) } } } /// Finalize output producing a [`Tag`] #[inline] fn finalize(mut self) -> Tag { if self.token.get() { unsafe { (*self.inner.avx2).finalize() } } else { unsafe { (*self.inner.soft).finalize_mut() } } } } impl Clone for State { fn clone(&self) -> Self { let inner = if self.token.get() { Inner { avx2: ManuallyDrop::new(unsafe { (*self.inner.avx2).clone() }), } } else { Inner { soft: ManuallyDrop::new(unsafe { (*self.inner.soft).clone() }), } }; Self { inner, token: self.token, } } } #[cfg(feature = "zeroize")] impl Drop for State { fn drop(&mut self) { use zeroize::Zeroize; const SIZE: usize = core::mem::size_of::(); let state = unsafe { &mut *(self as *mut State as *mut [u8; SIZE]) }; state.zeroize(); } } poly1305-0.8.0/src/backend/avx2/helpers.rs000064400000000000000000002513360072674642500162350ustar 00000000000000//! AVX2 helpers for implementing Poly1305 using 26-bit limbs. use core::fmt; use core::ops::{Add, Mul}; #[cfg(target_arch = "x86")] use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; use super::ParBlocks; use crate::{Block, Key}; const fn set02(x3: u8, x2: u8, x1: u8, x0: u8) -> i32 { (((x3) << 6) | ((x2) << 4) | ((x1) << 2) | (x0)) as i32 } /// Helper for Display impls of aligned values. fn write_130(f: &mut fmt::Formatter<'_>, limbs: [u32; 5]) -> fmt::Result { let r0 = limbs[0] as u128; let r1 = limbs[1] as u128; let r2 = limbs[2] as u128; let r3 = limbs[3] as u128; let r4 = limbs[4] as u128; // Reduce into two u128s let l0 = r0 + (r1 << 26) + (r2 << 52) + (r3 << 78); let (l0, c) = l0.overflowing_add(r4 << 104); let l1 = (r4 >> 24) + if c { 1 } else { 0 }; write!(f, "0x{:02x}{:032x}", l1, l0) } /// Helper for Display impls of unreduced values. fn write_130_wide(f: &mut fmt::Formatter<'_>, limbs: [u64; 5]) -> fmt::Result { let r0 = limbs[0] as u128; let r1 = limbs[1] as u128; let r2 = limbs[2] as u128; let r3 = limbs[3] as u128; let r4 = limbs[4] as u128; // Reduce into two u128s let l0 = r0 + (r1 << 26) + (r2 << 52); let (l0, c1) = l0.overflowing_add(r3 << 78); let (l0, c2) = l0.overflowing_add(r4 << 104); let l1 = (r3 >> 50) + (r4 >> 24) + if c1 { 1 } else { 0 } + if c2 { 1 } else { 0 }; write!(f, "0x{:02x}{:032x}", l1, l0) } /// Derives the Poly1305 addition and polynomial keys. 
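///
/// As a scalar sketch of what this produces (illustration only; the vectorized limb
/// layout used below is different), the polynomial key `R` is the clamped low half of
/// the 32-byte key and the addition key is the unmodified high half:
///
/// ```text
/// let key = [0u8; 32];                                         // hypothetical input
/// let r = u128::from_le_bytes(key[..16].try_into().unwrap())
///     & 0x0ffffffc_0ffffffc_0ffffffc_0fffffff;                 // polynomial key R
/// let s = u128::from_le_bytes(key[16..].try_into().unwrap());  // addition key
/// ```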
#[target_feature(enable = "avx2")] pub(super) unsafe fn prepare_keys(key: &Key) -> (AdditionKey, PrecomputedMultiplier) { // [k7, k6, k5, k4, k3, k2, k1, k0] let key = _mm256_loadu_si256(key.as_ptr() as *const _); // Prepare addition key: [0, k7, 0, k6, 0, k5, 0, k4] let k = AdditionKey(_mm256_and_si256( _mm256_permutevar8x32_epi32(key, _mm256_set_epi32(3, 7, 2, 6, 1, 5, 0, 4)), _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1), )); // Prepare polynomial key R = k & 0xffffffc0ffffffc0ffffffc0fffffff: let r = Aligned130::new(_mm256_and_si256( key, _mm256_set_epi32(0, 0, 0, 0, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc, 0x0fffffff), )); (k, r.into()) } /// A 130-bit integer aligned across five 26-bit limbs. /// /// The top three 32-bit words of the underlying 256-bit vector are ignored. #[derive(Clone, Copy, Debug)] pub(super) struct Aligned130(pub(super) __m256i); impl fmt::Display for Aligned130 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut v0 = [0u8; 32]; unsafe { _mm256_storeu_si256(v0.as_mut_ptr() as *mut _, self.0); } write!(f, "Aligned130(")?; write_130( f, [ u32::from_le_bytes(v0[0..4].try_into().unwrap()), u32::from_le_bytes(v0[4..8].try_into().unwrap()), u32::from_le_bytes(v0[8..12].try_into().unwrap()), u32::from_le_bytes(v0[12..16].try_into().unwrap()), u32::from_le_bytes(v0[16..20].try_into().unwrap()), ], )?; write!(f, ")") } } impl Aligned130 { /// Aligns a 16-byte Poly1305 block at 26-bit boundaries within 32-bit words, and sets /// the high bit. #[target_feature(enable = "avx2")] pub(super) unsafe fn from_block(block: &Block) -> Self { Aligned130::new(_mm256_or_si256( _mm256_and_si256( // Load the 128-bit block into a 256-bit vector. _mm256_castsi128_si256(_mm_loadu_si128(block.as_ptr() as *const _)), // Mask off the upper 128 bits (undefined by _mm256_castsi128_si256). _mm256_set_epi64x(0, 0, -1, -1), ), // Set the high bit. _mm256_set_epi64x(0, 1, 0, 0), )) } /// Aligns a partial Poly1305 block at 26-bit boundaries within 32-bit words. /// /// Assumes that the high bit is already correctly set for the partial block. #[target_feature(enable = "avx2")] pub(super) unsafe fn from_partial_block(block: &Block) -> Self { Aligned130::new(_mm256_and_si256( // Load the 128-bit block into a 256-bit vector. _mm256_castsi128_si256(_mm_loadu_si128(block.as_ptr() as *const _)), // Mask off the upper 128 bits (undefined by _mm256_castsi128_si256). _mm256_set_epi64x(0, 0, -1, -1), )) } /// Splits a 130-bit integer into five 26-bit limbs. #[target_feature(enable = "avx2")] unsafe fn new(x: __m256i) -> Self { // Starting from a 130-bit integer split across 32-bit words: // [0, 0, 0, [0; 30] || x4[2..0], x3, x2, x1, x0] // - Grab the low bits of each word: // x1 = [ // [0; 32], // [0; 32], // [0; 32], // [0; 6] || x4[ 2..0] || [0; 24], // x3[14..0] || [0; 18], // x2[20..0] || [0; 12], // x1[26..0] || [0; 6], // x0, // ] let xl = _mm256_sllv_epi32(x, _mm256_set_epi32(32, 32, 32, 24, 18, 12, 6, 0)); // Grab the high bits of each word, rotated up by one word: // xh = [ // [0; 32], // [0; 32], // [0; 32], // [0; 8] || x3[32.. 8] // [0; 14] || x2[32..14] // [0; 20] || x1[32..20] // [0; 26] || x0[32..26], // [0; 32], // ] let xh = _mm256_permutevar8x32_epi32( _mm256_srlv_epi32(x, _mm256_set_epi32(32, 32, 32, 2, 8, 14, 20, 26)), _mm256_set_epi32(6, 5, 4, 3, 2, 1, 0, 7), ); // - Combine the low and high bits: // [ // [0; 32], // [0; 32], // [0; 32], // [0; 6] || x4[ 2..0] || x3[32.. 
8] // x3[14..0] || x2[32..14] // x2[20..0] || x1[32..20] // x1[26..0] || x0[32..26], // x0, // ] // - Mask to 26 bits: // [ // [0; 32], // [0; 32], // [0; 32], // [0; 6] || x4[ 2..0] || x3[32.. 8] // [0; 6] || x3[ 8..0] || x2[32..14] // [0; 6] || x2[14..0] || x1[32..20] // [0; 6] || x1[20..0] || x0[32..26], // [0; 6] || x0[26..0], // ] Aligned130(_mm256_and_si256( _mm256_or_si256(xl, xh), _mm256_set_epi32( 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, ), )) } } impl Add for Aligned130 { type Output = Aligned130; fn add(self, other: Aligned130) -> Aligned130 { // With 26-bit limbs inside 32-bit words, there is plenty of space for unreduced // addition. unsafe { Aligned130(_mm256_add_epi32(self.0, other.0)) } } } /// A pre-computed multiplier. #[derive(Clone, Copy, Debug)] pub(super) struct PrecomputedMultiplier { pub(super) a: __m256i, pub(super) a_5: __m256i, } impl From for PrecomputedMultiplier { fn from(r: Aligned130) -> Self { unsafe { // Precompute 5*R. // // The 5-limb representation (r_4, r_3, r_2, r_1, r_0) of R and // (5·r4, 5·r3, 5·r2, 5·r1) are represented in two 256-bit vectors in the // following manner: // r1: [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0] // r1_5: [5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1] let a_5 = _mm256_permutevar8x32_epi32( _mm256_add_epi32(r.0, _mm256_slli_epi32(r.0, 2)), _mm256_set_epi32(4, 3, 2, 1, 1, 1, 1, 1), ); let a = _mm256_blend_epi32(r.0, a_5, 0b11100000); let a_5 = _mm256_permute2x128_si256(a_5, a_5, 0); PrecomputedMultiplier { a, a_5 } } } } impl Mul for PrecomputedMultiplier { type Output = Unreduced130; fn mul(self, other: PrecomputedMultiplier) -> Unreduced130 { // Pass through to `self.a` for multiplication. Aligned130(self.a) * other } } impl Mul for Aligned130 { type Output = Unreduced130; /// Multiplies 2 values using lazy reduction. /// /// Context switches from 32 bit to 64 bit. 
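///
/// The identity being exploited (a restatement of the limb comments below, not extra
/// code): with five 26-bit limbs, any product term `x_i·r_j` with `i + j >= 5` carries
/// a factor of 2^130, and since 2^130 ≡ 5 (mod 2^130 - 5) it folds into output limb
/// `i + j - 5` multiplied by 5. For example, the lowest output limb accumulated below is
///
/// ```text
/// t_0 = r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_3·x_2 + 5·r_4·x_1
/// ```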
#[inline(always)] fn mul(self, other: PrecomputedMultiplier) -> Unreduced130 { unsafe { // Starting with the following limb layout: // x = [ 0, 0, 0, x_4, x_3, x_2, x_1, x_0] // y = [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0] // z = [5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1] let x = self.0; let y = other.a; let z = other.a_5; // [ 0, x_4, 0, x_3, 0, x_2, 0, x_1] (32-bit words) // * 5·r_4 // = [5·r_4·x_4, 5·r_4·x_3, 5·r_4·x_2, 5·r_4·x_1] (64-bit words) let v0 = _mm256_mul_epu32( _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(4, 3, 2, 1)), _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(7, 7, 7, 7)), ); // [ 0, x_3, 0, x_2, 0, x_1, 0, x_0] (32-bit words) // * r_0 // = [ r_0·x_3, r_0·x_2, r_0·x_1, r_0·x_0] (64-bit words) // + previous step // = [ // r_0·x_3 + 5·r_4·x_4, // r_0·x_2 + 5·r_4·x_3, // r_0·x_1 + 5·r_4·x_2, // r_0·x_0 + 5·r_4·x_1, // ] let v0 = _mm256_add_epi64( v0, _mm256_mul_epu32( _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(3, 2, 1, 0)), _mm256_broadcastd_epi32(_mm256_castsi256_si128(y)), ), ); // [ 0, x_1, 0, x_1, 0, x_3, 0, x_3] // * [ 0, r_2, 0, r_1, 0, 5·r_3, 0, 5·r_2] // = [r_2·x_1, r_1·x_1, 5·r_3·x_3, 5·r_2·x_3] // + previous step // = [ // r_0·x_3 + r_2·x_1 + 5·r_4·x_4, // r_0·x_2 + r_1·x_1 + 5·r_4·x_3, // r_0·x_1 + 5·r_3·x_3 + 5·r_4·x_2, // r_0·x_0 + 5·r_2·x_3 + 5·r_4·x_1, // ] let v0 = _mm256_add_epi64( v0, _mm256_mul_epu32( _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(1, 1, 3, 3)), _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(2, 1, 6, 5)), ), ); // [x_3, x_2, x_1, x_0, x_1, x_0, 0, x_4] // * [ 0, r_1, 0, r_2, 0, r_1, 5·r_1, 5·r_1] // = [ r_1·x_2, r_2·x_0, r_1·x_0, 5·r_1·x_4] // + previous step // = [ // r_0·x_3 + r_1·x_2 + r_2·x_1 + 5·r_4·x_4, // r_0·x_2 + r_1·x_1 + r_2·x_0 + 5·r_4·x_3, // r_0·x_1 + r_1·x_0 + 5·r_3·x_3 + 5·r_4·x_2, // r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_4·x_1, // ] let v0 = _mm256_add_epi64( v0, _mm256_mul_epu32( _mm256_permute4x64_epi64(x, set02(1, 0, 0, 2)), _mm256_blend_epi32( _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(1, 2, 1, 1)), z, 0x03, ), ), ); // [x_1, x_0, 0, x_4, 0, x_4, x_3, x_2] // * [ 0, r_3, 0, 5·r_3, 0, 5·r_2, 0, 5·r_3] // = [ r_3·x_0, 5·r_3·x_4, 5·r_2·x_4, 5·r_3·x_2] // + previous step // v0 = [ // r_0·x_3 + r_1·x_2 + r_2·x_1 + r_3·x_0 + 5·r_4·x_4, // r_0·x_2 + r_1·x_1 + r_2·x_0 + 5·r_3·x_4 + 5·r_4·x_3, // r_0·x_1 + r_1·x_0 + 5·r_2·x_4 + 5·r_3·x_3 + 5·r_4·x_2, // r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_3·x_2 + 5·r_4·x_1, // ] let v0 = _mm256_add_epi64( v0, _mm256_mul_epu32( _mm256_permute4x64_epi64(x, set02(0, 2, 2, 1)), _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(3, 6, 5, 6)), ), ); // [ 0, x_3, 0, x_2, 0, x_1, 0, x_0] // * [ 0, r_1, 0, r_2, 0, r_3, 0, r_4] // = [r_1·x_3, r_2·x_2, r_3·x_1, r_4·x_0] let v1 = _mm256_mul_epu32( _mm256_permutevar8x32_epi32(x, _mm256_set_epi64x(3, 2, 1, 0)), _mm256_permutevar8x32_epi32(y, _mm256_set_epi64x(1, 2, 3, 4)), ); // [r_3·x_1, r_4·x_0, r_1·x_3, r_2·x_2] // + previous step // = [ // r_1·x_3 + r_3·x_1, // r_2·x_2 + r_4·x_0, // r_1·x_3 + r_3·x_1, // r_2·x_2 + r_4·x_0, // ] let v1 = _mm256_add_epi64(v1, _mm256_permute4x64_epi64(v1, set02(1, 0, 3, 2))); // [ // r_2·x_2 + r_4·x_0, // r_2·x_2 + r_4·x_0, // r_2·x_2 + r_4·x_0, // r_1·x_3 + r_3·x_1, // ] // + previous step // = [ // r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0, // 2·r_2·x_2 + 2·r_4·x_0, // r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0, // r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0, // ] let v1 = _mm256_add_epi64(v1, _mm256_permute4x64_epi64(v1, set02(0, 0, 0, 1))); // [ x_1, x_0, x_1, x_0, x_1, x_0, 0, x_4] // * 
[5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0] // = [ 5·r_3·x_0, r_4·x_0, r_2·x_0, r_0·x_4] // + previous step // v1 = [ // 5·r_3·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0, // 2·r_2·x_2 + 3·r_4·x_0, // r_2·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0, // r_0·x_4 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0, // ] let v1 = _mm256_add_epi64( v1, _mm256_mul_epu32(_mm256_permute4x64_epi64(x, set02(0, 0, 0, 2)), y), ); // The result: // v1 = [ // 5·r_3·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0, // 2·r_2·x_2 + 3·r_4·x_0, // r_2·x_0 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0, // r_0·x_4 + r_1·x_3 + r_2·x_2 + r_3·x_1 + r_4·x_0, // ] // v0 = [ // r_0·x_3 + r_1·x_2 + r_2·x_1 + r_3·x_0 + 5·r_4·x_4, // r_0·x_2 + r_1·x_1 + r_2·x_0 + 5·r_3·x_4 + 5·r_4·x_3, // r_0·x_1 + r_1·x_0 + 5·r_2·x_4 + 5·r_3·x_3 + 5·r_4·x_2, // r_0·x_0 + 5·r_1·x_4 + 5·r_2·x_3 + 5·r_3·x_2 + 5·r_4·x_1, // ] // This corresponds to (3) in Goll Gueron 2015: // v1 = [ _, _, _, t_4] // v0 = [t_3, t_2, t_1, t_0] Unreduced130 { v0, v1 } } } } /// The unreduced output of an `Aligned130` multiplication. /// /// Represented internally with 64-bit limbs. #[derive(Copy, Clone, Debug)] pub(super) struct Unreduced130 { v0: __m256i, v1: __m256i, } impl fmt::Display for Unreduced130 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut v0 = [0u8; 32]; let mut v1 = [0u8; 32]; unsafe { _mm256_storeu_si256(v0.as_mut_ptr() as *mut _, self.v0); _mm256_storeu_si256(v1.as_mut_ptr() as *mut _, self.v1); } write!(f, "Unreduced130(")?; write_130_wide( f, [ u64::from_le_bytes(v0[0..8].try_into().unwrap()), u64::from_le_bytes(v0[8..16].try_into().unwrap()), u64::from_le_bytes(v0[16..24].try_into().unwrap()), u64::from_le_bytes(v0[24..32].try_into().unwrap()), u64::from_le_bytes(v1[0..8].try_into().unwrap()), ], )?; write!(f, ")") } } impl Unreduced130 { /// Reduces x modulo 2^130 - 5. /// /// Context switches from 64 bit to 32 bit. 
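///
/// A scalar sketch of the carry-and-fold sequence (illustration only, over hypothetical
/// `u64` limbs `t[0..=4]`; the real code performs the same steps on packed vectors via
/// `adc` and `red`):
///
/// ```text
/// // carry chain: push each limb's overflow into the next limb
/// for i in 0..4 { t[i + 1] += t[i] >> 26; t[i] &= 0x3ffffff; }
/// // fold: 2^130 ≡ 5 (mod 2^130 - 5), so bits above the top limb wrap into limb 0 times 5
/// t[0] += 5 * (t[4] >> 26); t[4] &= 0x3ffffff;
/// // one more carry pass leaves the limbs suitably small for the next block
/// ```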
#[inline(always)] pub(super) fn reduce(self) -> Aligned130 { unsafe { // Starting with the following limb layout: // self.v1 = [ _, _, _, t_4] // self.v0 = [t_3, t_2, t_1, t_0] let (red_1, red_0) = adc(self.v1, self.v0); let (red_1, red_0) = red(red_1, red_0); let (red_1, red_0) = adc(red_1, red_0); // - Switch context from 64-bit limbs to 32-bit limbs: Aligned130(_mm256_blend_epi32( _mm256_permutevar8x32_epi32(red_0, _mm256_set_epi32(0, 6, 4, 0, 6, 4, 2, 0)), _mm256_permutevar8x32_epi32(red_1, _mm256_set_epi32(0, 6, 4, 0, 6, 4, 2, 0)), 0x90, )) } } } /// Carry chain #[inline(always)] unsafe fn adc(v1: __m256i, v0: __m256i) -> (__m256i, __m256i) { // [t_3, t_2 % 2^26, t_1 % 2^26, t_0 % 2^26] // + [t_2 >> 26, t_1 >> 26, t_0 >> 26, 0 ] // = [ // t_3 + t_2 >> 26, // t_2 % 2^26 + t_1 >> 26, // t_1 % 2^26 + t_0 >> 26, // t_0 % 2^26, // ] let v0 = _mm256_add_epi64( _mm256_and_si256(v0, _mm256_set_epi64x(-1, 0x3ffffff, 0x3ffffff, 0x3ffffff)), _mm256_permute4x64_epi64( _mm256_srlv_epi64(v0, _mm256_set_epi64x(64, 26, 26, 26)), set02(2, 1, 0, 3), ), ); // [_, _, _, t_4] // + [ // (t_2 % 2^26 + t_1 >> 26) >> 26, // (t_1 % 2^26 + t_0 >> 26) >> 26, // (t_0 % 2^26 ) >> 26, // (t_3 + t_2 >> 26) >> 26, // ] // = [_, _, _, t_4 + (t_3 + t_2 >> 26) >> 26] let v1 = _mm256_add_epi64( v1, _mm256_permute4x64_epi64(_mm256_srli_epi64(v0, 26), set02(2, 1, 0, 3)), ); // [ // (t_3 + t_2 >> 26) % 2^26, // t_2 % 2^26 + t_1 >> 26, // t_1 % 2^26 + t_0 >> 26, // t_0 % 2^26, // ] let chain = _mm256_and_si256(v0, _mm256_set_epi64x(0x3ffffff, -1, -1, -1)); (v1, chain) } /// Reduction modulus 2^130-5 #[inline(always)] unsafe fn red(v1: __m256i, v0: __m256i) -> (__m256i, __m256i) { // t = [0, 0, 0, t_4 >> 26] let t = _mm256_srlv_epi64(v1, _mm256_set_epi64x(64, 64, 64, 26)); // v0 + 5·t = [t_3, t_2, t_1, t_0 + 5·(t_4 >> 26)] let red_0 = _mm256_add_epi64(_mm256_add_epi64(v0, t), _mm256_slli_epi64(t, 2)); // [0, 0, 0, t_4 % 2^26] let red_1 = _mm256_and_si256(v1, _mm256_set_epi64x(0, 0, 0, 0x3ffffff)); (red_1, red_0) } /// A pair of `Aligned130`s. #[derive(Clone, Debug)] pub(super) struct Aligned2x130 { v0: Aligned130, v1: Aligned130, } impl fmt::Display for Aligned2x130 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "Aligned2x130([")?; writeln!(f, " {},", self.v0)?; writeln!(f, " {},", self.v1)?; write!(f, "])") } } impl Aligned2x130 { /// Aligns two 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and /// sets the high bit for each block. /// /// # Panics /// /// Panics if `src.len() < 32`. #[target_feature(enable = "avx2")] pub(super) unsafe fn from_blocks(src: &[Block; 2]) -> Self { Aligned2x130 { v0: Aligned130::from_block(&src[0]), v1: Aligned130::from_block(&src[1]), } } /// Multiplies 2x2 and add both results simultaneously using lazy reduction. /// /// Context switches from 32 bit to 64 bit. 
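///
/// The unreduced result is `self.v0·r2 + self.v1·r1 (mod 2^130 - 5)`. Assuming the
/// caller passes `r1 = r` and `r2 = r^2` (an assumption about the call site, which is
/// not part of this file), this performs two Horner steps at once:
///
/// ```text
/// (h + m_0)·r^2 + m_1·r  ==  ((h + m_0)·r + m_1)·r
/// ```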
#[inline(always)] pub(super) fn mul_and_sum( self, r1: PrecomputedMultiplier, r2: PrecomputedMultiplier, ) -> Unreduced130 { unsafe { // Starting with the following limb layout: // x.v1 = [ 0, 0, 0, x1_4, x1_3, x1_2, x1_1, x1_0] // x.v0 = [ 0, 0, 0, x0_4, x0_3, x0_2, x0_1, x0_0] // r1 = [5·r1_4, 5·r1_3, 5·r1_2, r1_4, r1_3, r1_2, r1_1, r1_0] // r15 = [5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1, 5·r1_1] // r2 = [5·r2_4, 5·r2_3, 5·r2_2, r2_4, r2_3, r2_2, r2_1, r2_0] // r25 = [5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1, 5·r2_1] let x = self; let r15 = r1.a_5; let r25 = r2.a_5; let r1 = r1.a; let r2 = r2.a; // v0 = [ // 5·x0_4·r2_4, // 5·x0_3·r2_4, // 5·x0_2·r2_4, // 5·x0_1·r2_4, // ] let mut v0 = _mm256_mul_epu32( // [_, x0_4, _, x0_3, _, x0_2, _, x0_1] _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(4, 3, 2, 1)), // [_, 5·r2_4, _, 5·r2_4, _, 5·r2_4, _, 5·r2_4] _mm256_permutevar8x32_epi32(r2, _mm256_set1_epi64x(7)), ); // v1 = [ // 5·x1_4·r1_4, // 5·x1_3·r1_4, // 5·x1_2·r1_4, // 5·x1_1·r1_4, // ] let mut v1 = _mm256_mul_epu32( // [_, x1_4, _, x1_3, _, x1_2, _, x1_1] _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(4, 3, 2, 1)), // [_, 5·r1_4, _, 5·r1_4, _, 5·r1_4, _, 5·r1_4] _mm256_permutevar8x32_epi32(r1, _mm256_set1_epi64x(7)), ); // v0 = [ // `x0_0·r2_3`+ 5·x0_4·r2_4, // `5·x0_4·r2_3`+ 5·x0_3·r2_4, // `5·x0_4·r2_2` + 5·x0_2·r2_4, // `5·x0_2·r2_3`+ 5·x0_1·r2_4, // ] v0 = _mm256_add_epi64( v0, _mm256_mul_epu32( // [_, x0_0, _, x0_4, _, x0_4, _, x0_2] _mm256_permute4x64_epi64(x.v0.0, set02(0, 2, 2, 1)), // [_, r2_3, _, 5·r2_3, _, 5·r2_2, _, 5·r2_3] _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(3, 6, 5, 6)), ), ); // v1 = [ // `x1_0·r1_3`+ 5·x1_4·r1_4, // `5·x1_4·r1_3`+ 5·x1_3·r1_4, // `5·x1_4·r1_2` + 5·x1_2·r1_4, // `5·x1_2·r1_3`+ 5·x1_1·r1_4, // ] v1 = _mm256_add_epi64( v1, _mm256_mul_epu32( // [_, x1_0, _, x1_4, _, x1_4, _, x1_2] _mm256_permute4x64_epi64(x.v1.0, set02(0, 2, 2, 1)), // [_, r1_3, _, 5·r1_3, _, 5·r1_2, _, 5·r1_3] _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(3, 6, 5, 6)), ), ); // v0 = [ // `x0_1·r2_2`+ x0_0·r2_3 + 5·x0_4·r2_4, // `x0_1·r2_1` + 5·x0_4·r2_3 + 5·x0_3·r2_4, // 5·x0_4·r2_2 +`5·x0_3·r2_3`+ 5·x0_2·r2_4, // `5·x0_3·r2_2`+ 5·x0_2·r2_3 + 5·x0_1·r2_4, // ] v0 = _mm256_add_epi64( v0, _mm256_mul_epu32( // [_, x0_1, _, x0_1, _, x0_3, _, x0_3] _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(1, 1, 3, 3)), // [_, r2_2, _, r2_1, _, 5·r2_3, _, 5·r2_2] _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(2, 1, 6, 5)), ), ); // v1 = [ // `x1_1·r1_2`+ x1_0·r1_3 + 5·x1_4·r1_4, // `x1_1·r1_1` + 5·x1_4·r1_3 + 5·x1_3·r1_4, // 5·x1_4·r1_2 +`5·x1_3·r1_3`+ 5·x1_2·r1_4, // `5·x1_3·r1_2`+ 5·x1_2·r1_3 + 5·x1_1·r1_4, // ] v1 = _mm256_add_epi64( v1, _mm256_mul_epu32( // [_, x1_1, _, x1_1, _, x1_3, _, x1_3] _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(1, 1, 3, 3)), // [_, r1_2, _, r1_1, _, 5·r1_3, _, 5·r1_2] _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(2, 1, 6, 5)), ), ); // v0 = [ // `x0_3·r2_0` + x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4, // `x0_2·r2_0`+ x0_1·r2_1 + 5·x0_4·r2_3 + 5·x0_3·r2_4, // `x0_1·r2_0` + 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4, // `x0_0·r2_0` + 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4, // ] v0 = _mm256_add_epi64( v0, _mm256_mul_epu32( // [_, x0_3, _, x0_2, _, x0_1, _, x0_0] _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(3, 2, 1, 0)), // [_, r2_0, _, r2_0, _, r2_0, _, r2_0] _mm256_broadcastd_epi32(_mm256_castsi256_si128(r2)), ), ); // v1 = [ // `x1_3·r1_0` + x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4, // `x1_2·r1_0`+ 
x1_1·r1_1 + 5·x1_4·r1_3 + 5·x1_3·r1_4, // `x1_1·r1_0` + 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4, // `x1_0·r1_0` + 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4, // ] v1 = _mm256_add_epi64( v1, _mm256_mul_epu32( // [_, x1_3, _, x1_2, _, x1_1, _, x1_0] _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(3, 2, 1, 0)), // [_, r1_0, _, r1_0, _, r1_0, _, r1_0] _mm256_broadcastd_epi32(_mm256_castsi256_si128(r1)), ), ); // t0 = [x0_3, x0_2, x0_1, x0_0, x0_1, x0_0, 0, x0_4] // t1 = [x1_3, x1_2, x1_1, x1_0, x1_1, x1_0, 0, x1_4] let mut t0 = _mm256_permute4x64_epi64(x.v0.0, set02(1, 0, 0, 2)); let mut t1 = _mm256_permute4x64_epi64(x.v1.0, set02(1, 0, 0, 2)); // v0 = [ // x0_3·r2_0 + `x0_2·r2_1`+ x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4, // x0_2·r2_0 + x0_1·r2_1 + `x0_0·r2_2`+ 5·x0_4·r2_3 + 5·x0_3·r2_4, // x0_1·r2_0 + `x0_0·r2_1`+ 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4, // x0_0·r2_0 +`5·x0_4·r2_1`+ 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4, // ] v0 = _mm256_add_epi64( v0, _mm256_mul_epu32( // [_, x0_2, _, x0_0, _, x0_0, _, x0_4] t0, // [_, r2_1, _, r2_2, _, r2_1, _, 5·r2_1] _mm256_blend_epi32( // [r2_0, r2_1, r2_0, r2_2, r2_0, r2_1, r2_0, r2_1] _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(1, 2, 1, 1)), r25, 0b00000011, ), ), ); // v1 = [ // x1_3·r1_0 + `x1_2·r1_1`+ x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4, // x1_2·r1_0 + x1_1·r1_1 + `x1_0·r1_2`+ 5·x1_4·r1_3 + 5·x1_3·r1_4, // x1_1·r1_0 + `x1_0·r1_1`+ 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4, // x1_0·r1_0 +`5·x1_4·r1_1`+ 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4, // ] v1 = _mm256_add_epi64( v1, _mm256_mul_epu32( // [_, x1_2, _, x1_0, _, x1_0, _, x1_4] t1, // [_, r1_1, _, r1_2, _, r1_1, _, 5·r1_1] _mm256_blend_epi32( // [r1_0, r1_1, r1_0, r1_2, r1_0, r1_1, r1_0, r1_1] _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(1, 2, 1, 1)), r15, 0b00000011, ), ), ); // v0 = [ // x0_3·r2_0 + x0_2·r2_1 + x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4 + x1_3·r1_0 + x1_2·r1_1 + x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4, // x0_2·r2_0 + x0_1·r2_1 + x0_0·r2_2 + 5·x0_4·r2_3 + 5·x0_3·r2_4 + x1_2·r1_0 + x1_1·r1_1 + x1_0·r1_2 + 5·x1_4·r1_3 + 5·x1_3·r1_4, // x0_1·r2_0 + x0_0·r2_1 + 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4 + x1_1·r1_0 + x1_0·r1_1 + 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4, // x0_0·r2_0 + 5·x0_4·r2_1 + 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4 + x1_0·r1_0 + 5·x1_4·r1_1 + 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4, // ] v0 = _mm256_add_epi64(v0, v1); // t0 = [ // 5·x0_2·r2_3, // x0_0·r2_4, // x0_0·r2_2, // x0_4·r2_0, // ] // t1 = [ // 5·x1_2·r1_3, // x1_0·r1_4, // x1_0·r1_2, // x1_4·r1_0, // ] t0 = _mm256_mul_epu32(t0, r2); t1 = _mm256_mul_epu32(t1, r1); // v1 = [ // 5·x0_2·r2_3 + 5·x1_2·r1_3, // x0_0·r2_4 + x1_0·r1_4, // x0_0·r2_2 + x1_0·r1_2, // x0_4·r2_0 + x1_4·r1_0, // ] v1 = _mm256_add_epi64(t0, t1); // t0 = [ // x0_3·r2_1, // x0_2·r2_2, // x0_1·r2_3, // x0_0·r2_4, // ] t0 = _mm256_mul_epu32( // [_, x0_3, _, x0_2, _, x0_1, _, x0_0] _mm256_permutevar8x32_epi32(x.v0.0, _mm256_set_epi64x(3, 2, 1, 0)), // [_, r2_1, _, r2_2, _, r2_3, _, r2_4] _mm256_permutevar8x32_epi32(r2, _mm256_set_epi64x(1, 2, 3, 4)), ); // t1 = [ // x1_3·r1_1, // x1_2·r1_2, // x1_1·r1_3, // x1_0·r1_4, // ] t1 = _mm256_mul_epu32( // [_, x1_3, _, x1_2, _, x1_1, _, x1_0] _mm256_permutevar8x32_epi32(x.v1.0, _mm256_set_epi64x(3, 2, 1, 0)), // [_, r1_1, _, r1_2, _, r1_3, _, r1_4] _mm256_permutevar8x32_epi32(r1, _mm256_set_epi64x(1, 2, 3, 4)), ); // t0 = [ // x0_3·r2_1 + x1_3·r1_1, // x0_2·r2_2 + x1_2·r1_2, // x0_1·r2_3 + x1_1·r1_3, // x0_0·r2_4 + x1_0·r1_4, // ] t0 = _mm256_add_epi64(t0, t1); // t0 = [ // x0_3·r2_1 + x0_1·r2_3 + 
x1_3·r1_1 + x1_1·r1_3, // x0_2·r2_2 + x0_0·r2_4 + x1_2·r1_2 + x1_0·r1_4, // x0_3·r2_1 + x0_1·r2_3 + x1_3·r1_1 + x1_1·r1_3, // x0_2·r2_2 + x0_0·r2_4 + x1_2·r1_2 + x1_0·r1_4, // ] t0 = _mm256_add_epi64(t0, _mm256_permute4x64_epi64(t0, set02(1, 0, 3, 2))); // t0 = [ // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4, // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4, // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4, // x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4, // ] t0 = _mm256_add_epi64(t0, _mm256_permute4x64_epi64(t0, set02(2, 3, 0, 1))); // v1 = [ // 5·x0_2·r2_3 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + 5·x1_2·r1_3 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4, // x0_0·r2_4 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_0·r1_4 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4, // x0_0·r2_2 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_0·r1_2 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4, // x0_4·r2_0 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_4·r1_0 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4, // ] v1 = _mm256_add_epi64(v1, t0); // The result: // v1 = [ // _, _, _, // x0_4·r2_0 + x0_3·r2_1 + x0_2·r2_2 + x0_1·r2_3 + x0_0·r2_4 + x1_4·r1_0 + x1_3·r1_1 + x1_2·r1_2 + x1_1·r1_3 + x1_0·r1_4, // ] // v0 = [ // x0_3·r2_0 + x0_2·r2_1 + x0_1·r2_2 + x0_0·r2_3 + 5·x0_4·r2_4 + x1_3·r1_0 + x1_2·r1_1 + x1_1·r1_2 + x1_0·r1_3 + 5·x1_4·r1_4, // x0_2·r2_0 + x0_1·r2_1 + x0_0·r2_2 + 5·x0_4·r2_3 + 5·x0_3·r2_4 + x1_2·r1_0 + x1_1·r1_1 + x1_0·r1_2 + 5·x1_4·r1_3 + 5·x1_3·r1_4, // x0_1·r2_0 + x0_0·r2_1 + 5·x0_4·r2_2 + 5·x0_3·r2_3 + 5·x0_2·r2_4 + x1_1·r1_0 + x1_0·r1_1 + 5·x1_4·r1_2 + 5·x1_3·r1_3 + 5·x1_2·r1_4, // x0_0·r2_0 + 5·x0_4·r2_1 + 5·x0_3·r2_2 + 5·x0_2·r2_3 + 5·x0_1·r2_4 + x1_0·r1_0 + 5·x1_4·r1_1 + 5·x1_3·r1_2 + 5·x1_2·r1_3 + 5·x1_1·r1_4, // ] Unreduced130 { v0, v1 } } } } impl Add for Aligned2x130 { type Output = Aligned2x130; /// Adds `other` into the lower integer of `self`. fn add(self, other: Aligned130) -> Aligned2x130 { Aligned2x130 { v0: self.v0 + other, v1: self.v1, } } } /// A multiplier that takes 130-bit integers `(x3, x2, x1, x0)` and computes /// `(x3·R^4, x2·R^3, x1·R^2, x0·R) mod 2^130 - 5`. #[derive(Copy, Clone, Debug)] pub(super) struct SpacedMultiplier4x130 { v0: __m256i, v1: __m256i, r1: PrecomputedMultiplier, } impl SpacedMultiplier4x130 { /// Returns `(multipler, R^4)` given `(R^1, R^2)`. #[target_feature(enable = "avx2")] pub(super) unsafe fn new( r1: PrecomputedMultiplier, r2: PrecomputedMultiplier, ) -> (Self, PrecomputedMultiplier) { let r3 = (r2 * r1).reduce(); let r4 = (r2 * r2).reduce(); // v0 = [r2_4, r2_3, r2_1, r3_4, r3_3, r3_2, r3_1, r3_0] let v0 = _mm256_blend_epi32( r3.0, _mm256_permutevar8x32_epi32(r2.a, _mm256_set_epi32(4, 3, 1, 0, 0, 0, 0, 0)), 0b11100000, ); // v1 = [r2_4, r2_2, r2_0, r4_4, r4_3, r4_2, r4_1, r4_0] let v1 = _mm256_blend_epi32( r4.0, _mm256_permutevar8x32_epi32(r2.a, _mm256_set_epi32(4, 2, 0, 0, 0, 0, 0, 0)), 0b11100000, ); let m = SpacedMultiplier4x130 { v0, v1, r1 }; (m, r4.into()) } } /// Four 130-bit integers aligned across five 26-bit limbs each. /// /// Unlike `Aligned2x130` which wraps two `Aligned130`s, this struct represents the four /// integers as 20 limbs spread across three 256-bit vectors. 
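///
/// The limb placement (summarized from the diagrams in the `Mul` impl below, where
/// `xNM` denotes limb `M` of integer `N`):
///
/// ```text
/// v2 = [  _, x34,   _, x24,   _, x14,   _, x04]   // limb 4 of each integer
/// v1 = [x33, x31, x23, x21, x13, x11, x03, x01]   // limbs 3 and 1
/// v0 = [x32, x30, x22, x20, x12, x10, x02, x00]   // limbs 2 and 0
/// ```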
#[derive(Copy, Clone, Debug)] pub(super) struct Aligned4x130 { v0: __m256i, v1: __m256i, v2: __m256i, } impl fmt::Display for Aligned4x130 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut v0 = [0u8; 32]; let mut v1 = [0u8; 32]; let mut v2 = [0u8; 32]; unsafe { _mm256_storeu_si256(v0.as_mut_ptr() as *mut _, self.v0); _mm256_storeu_si256(v1.as_mut_ptr() as *mut _, self.v1); _mm256_storeu_si256(v2.as_mut_ptr() as *mut _, self.v2); } writeln!(f, "Aligned4x130([")?; write!(f, " ")?; write_130( f, [ u32::from_le_bytes(v0[0..4].try_into().unwrap()), u32::from_le_bytes(v1[0..4].try_into().unwrap()), u32::from_le_bytes(v0[4..8].try_into().unwrap()), u32::from_le_bytes(v1[4..8].try_into().unwrap()), u32::from_le_bytes(v2[0..4].try_into().unwrap()), ], )?; writeln!(f, ",")?; write!(f, " ")?; write_130( f, [ u32::from_le_bytes(v0[8..12].try_into().unwrap()), u32::from_le_bytes(v1[8..12].try_into().unwrap()), u32::from_le_bytes(v0[12..16].try_into().unwrap()), u32::from_le_bytes(v1[12..16].try_into().unwrap()), u32::from_le_bytes(v2[8..12].try_into().unwrap()), ], )?; writeln!(f, ",")?; write!(f, " ")?; write_130( f, [ u32::from_le_bytes(v0[16..20].try_into().unwrap()), u32::from_le_bytes(v1[16..20].try_into().unwrap()), u32::from_le_bytes(v0[20..24].try_into().unwrap()), u32::from_le_bytes(v1[20..24].try_into().unwrap()), u32::from_le_bytes(v2[16..20].try_into().unwrap()), ], )?; writeln!(f, ",")?; write!(f, " ")?; write_130( f, [ u32::from_le_bytes(v0[24..28].try_into().unwrap()), u32::from_le_bytes(v1[24..28].try_into().unwrap()), u32::from_le_bytes(v0[28..32].try_into().unwrap()), u32::from_le_bytes(v1[28..32].try_into().unwrap()), u32::from_le_bytes(v2[24..28].try_into().unwrap()), ], )?; writeln!(f, ",")?; write!(f, "])") } } impl Aligned4x130 { /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and /// sets the high bit for each block. /// /// # Panics /// /// Panics if `src.len() < 64`. #[target_feature(enable = "avx2")] pub(super) unsafe fn from_blocks(src: &[Block; 4]) -> Self { let (lo, hi) = src.split_at(2); let blocks_23 = _mm256_loadu_si256(hi.as_ptr() as *const _); let blocks_01 = _mm256_loadu_si256(lo.as_ptr() as *const _); Self::from_loaded_blocks(blocks_01, blocks_23) } /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and /// sets the high bit for each block. #[target_feature(enable = "avx2")] pub(super) unsafe fn from_par_blocks(src: &ParBlocks) -> Self { let (lo, hi) = src.split_at(2); let blocks_23 = _mm256_loadu_si256(hi.as_ptr() as *const _); let blocks_01 = _mm256_loadu_si256(lo.as_ptr() as *const _); Self::from_loaded_blocks(blocks_01, blocks_23) } /// Aligns four 16-byte Poly1305 blocks at 26-bit boundaries within 32-bit words, and /// sets the high bit for each block. /// /// The four blocks must be in the following 32-bit word layout: /// [b33, b32, b31, b30, b23, b22, b21, b20] /// [b13, b12, b11, b10, b03, b02, b01, b00] #[target_feature(enable = "avx2")] unsafe fn from_loaded_blocks(blocks_01: __m256i, blocks_23: __m256i) -> Self { // 26-bit mask on each 32-bit word. let mask_26 = _mm256_set1_epi32(0x3ffffff); // Sets bit 24 of each 32-bit word. 
let set_hibit = _mm256_set1_epi32(1 << 24); // - Unpack the upper and lower 64 bits: // [b33, b32, b13, b12, b23, b22, b03, b02] // [b31, b30, b11, b10, b21, b20, b01, b00] // // - Swap the middle two 64-bit words: // a0 = [b33, b32, b23, b22, b13, b12, b03, b02] // a1 = [b31, b30, b21, b20, b11, b10, b01, b00] let a0 = _mm256_permute4x64_epi64( _mm256_unpackhi_epi64(blocks_01, blocks_23), set02(3, 1, 2, 0), ); let a1 = _mm256_permute4x64_epi64( _mm256_unpacklo_epi64(blocks_01, blocks_23), set02(3, 1, 2, 0), ); // - Take the upper 24 bits of each 64-bit word in a0, and set the high bits: // v2 = [ // [0; 7] || 1 || [0; 31] || 1 || b33[32..8], // [0; 7] || 1 || [0; 31] || 1 || b23[32..8], // [0; 7] || 1 || [0; 31] || 1 || b13[32..8], // [0; 7] || 1 || [0; 31] || 1 || b03[32..8], // ] let v2 = _mm256_or_si256(_mm256_srli_epi64(a0, 40), set_hibit); // - Combine the lower 46 bits of each 64-bit word in a0 with the upper 18 // bits of each 64-bit word in a1: // a2 = [ // b33[14..0] || b32 || b31[32..14], // b23[14..0] || b22 || b21[32..14], // b13[14..0] || b12 || b11[32..14], // b03[14..0] || b02 || b01[32..14], // ] let a2 = _mm256_or_si256(_mm256_srli_epi64(a1, 46), _mm256_slli_epi64(a0, 18)); // - Take the upper 38 bits of each 64-bit word in a1: // [ // [0; 26] || b31 || b30[32..26], // [0; 26] || b21 || b20[32..26], // [0; 26] || b11 || b10[32..26], // [0; 26] || b01 || b00[32..26], // ] // - Blend in a2 on 32-bit words with alternating [a2 a1 ..] control pattern: // [ // b33[14..0] || b32[32..14] || b31[26..0] || b30[32..26], // b23[14..0] || b22[32..14] || b21[26..0] || b20[32..26], // b13[14..0] || b12[32..14] || b11[26..0] || b10[32..26], // b03[14..0] || b02[32..14] || b01[26..0] || b00[32..26], // ] // - Apply the 26-bit mask to each 32-bit word: // v1 = [ // [0; 6] || b33[8..0] || b32[32..14] || [0; 6] || b31[20..0] || b30[32..26], // [0; 6] || b23[8..0] || b22[32..14] || [0; 6] || b21[20..0] || b20[32..26], // [0; 6] || b13[8..0] || b12[32..14] || [0; 6] || b11[20..0] || b10[32..26], // [0; 6] || b03[8..0] || b02[32..14] || [0; 6] || b01[20..0] || b00[32..26], // ] let v1 = _mm256_and_si256( _mm256_blend_epi32(_mm256_srli_epi64(a1, 26), a2, 0xAA), mask_26, ); // - Take the lower 38 bits of each 64-bit word in a2: // [ // b32[20..0] || b31[32..14] || [0; 26], // b22[20..0] || b21[32..14] || [0; 26], // b12[20..0] || b11[32..14] || [0; 26], // b02[20..0] || b01[32..14] || [0; 26], // ] // - Blend in a1 on 32-bit words with alternating [a2 a1 ..] 
control pattern: // [ // b32[20..0] || b31[32..20] || b30, // b22[20..0] || b21[32..20] || b20, // b12[20..0] || b11[32..20] || b10, // b02[20..0] || b01[32..20] || b00, // ] // - Apply the 26-bit mask to each 32-bit word: // v0 = [ // [0; 6] || b32[14..0] || b31[32..20] || [0; 6] || b30[26..0], // [0; 6] || b22[14..0] || b21[32..20] || [0; 6] || b20[26..0], // [0; 6] || b12[14..0] || b11[32..20] || [0; 6] || b10[26..0], // [0; 6] || b02[14..0] || b01[32..20] || [0; 6] || b00[26..0], // ] let v0 = _mm256_and_si256( _mm256_blend_epi32(a1, _mm256_slli_epi64(a2, 26), 0xAA), mask_26, ); // The result: // v2 = [ v1 = [ v0 = [ // [0; 7] || 1 || [0; 24], [0; 6] || b33[ 8..0] || b32[32..14], [0; 6] || b32[14..0] || b31[32..20], // [0; 7] || 1 || b33[32..8], [0; 6] || b31[20..0] || b30[32..26], [0; 6] || b30[26..0], // [0; 7] || 1 || [0; 24], [0; 6] || b23[ 8..0] || b22[32..14], [0; 6] || b22[14..0] || b21[32..20], // [0; 7] || 1 || b23[32..8], [0; 6] || b21[20..0] || b20[32..26], [0; 6] || b20[26..0], // [0; 7] || 1 || [0; 24], [0; 6] || b13[ 8..0] || b12[32..14], [0; 6] || b12[14..0] || b11[32..20], // [0; 7] || 1 || b13[32..8], [0; 6] || b11[20..0] || b10[32..26], [0; 6] || b10[26..0], // [0; 7] || 1 || [0; 24], [0; 6] || b03[ 8..0] || b02[32..14], [0; 6] || b02[14..0] || b01[32..20], // [0; 7] || 1 || b03[32..8], [0; 6] || b01[20..0] || b00[32..26], [0; 6] || b00[26..0], // ] ] ] Aligned4x130 { v0, v1, v2 } } } impl Add for Aligned4x130 { type Output = Aligned4x130; #[inline(always)] fn add(self, other: Aligned4x130) -> Aligned4x130 { // With 26-bit limbs inside 32-bit words, there is plenty of space for unreduced // addition. unsafe { Aligned4x130 { v0: _mm256_add_epi32(self.v0, other.v0), v1: _mm256_add_epi32(self.v1, other.v1), v2: _mm256_add_epi32(self.v2, other.v2), } } } } impl Mul for &Aligned4x130 { type Output = Unreduced4x130; #[inline(always)] fn mul(self, other: PrecomputedMultiplier) -> Unreduced4x130 { unsafe { // Starting with the following limb layout: // x.v2 = [ _, x34, _, x24, _, x14, _, x04] // x.v1 = [ x33, x31, x23, x21, x13, x11, x03, x01] // x.v0 = [ x32, x30, x22, x20, x12, x10, x02, x00] // y = [5·r_4, 5·r_3, 5·r_2, r_4, r_3, r_2, r_1, r_0] // z = [5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1, 5·r_1] let mut x = *self; let y = other.a; let z = other.a_5; // Prepare a permutation that swaps the two limbs within a 64-bit window. 
let ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1); // t0 = [r_1, r_0, r_1, r_0, r_1, r_0, r_1, r_0] -> ·r_0 // t1 = [r_3, r_2, r_3, r_2, r_3, r_2, r_3, r_2] -> ·r_2 let mut t0 = _mm256_permute4x64_epi64(y, set02(0, 0, 0, 0)); let mut t1 = _mm256_permute4x64_epi64(y, set02(1, 1, 1, 1)); // v0 = [x30·r_0, x20·r_0, x10·r_0, x00·r_0] // v1 = [x31·r_0, x21·r_0, x11·r_0, x01·r_0] // v4 = [x34·r_0, x24·r_0, x14·r_0, x04·r_0] // v2 = [x30·r_2, x20·r_2, x10·r_2, x00·r_2] // v3 = [x31·r_2, x21·r_2, x11·r_2, x01·r_2] let mut v0 = _mm256_mul_epu32(x.v0, t0); // xN0·r_0 let mut v1 = _mm256_mul_epu32(x.v1, t0); // xN1·r_0 let mut v4 = _mm256_mul_epu32(x.v2, t0); // xN4·r_0 let mut v2 = _mm256_mul_epu32(x.v0, t1); // xN0·r_2 let mut v3 = _mm256_mul_epu32(x.v1, t1); // xN1·r_2 // t0 = [r_0, r_1, r_0, r_1, r_0, r_1, r_0, r_1] -> ·r_1 // t1 = [r_2, r_3, r_2, r_3, r_2, r_3, r_2, r_3] -> ·r_3 t0 = _mm256_permutevar8x32_epi32(t0, ord); t1 = _mm256_permutevar8x32_epi32(t1, ord); // v1 = [x31·r_0 + x30·r_1, x21·r_0 + x20·r_1, x11·r_0 + x10·r_1, x01·r_0 + x00·r_1] // v2 = [x31·r_1 + x30·r_2, x21·r_1 + x20·r_2, x11·r_1 + x10·r_2, x01·r_1 + x00·r_2] // v3 = [x31·r_2 + x30·r_3, x21·r_2 + x20·r_3, x11·r_2 + x10·r_3, x01·r_2 + x00·r_3] // v4 = [x34·r_0 + x31·r_3, x24·r_0 + x21·r_3, x14·r_0 + x11·r_3, x04·r_0 + x01·r_3] v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, t0)); // + xN0·r_1 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, t0)); // + xN1·r_1 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, t1)); // + xN0·r_3 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, t1)); // + xN1·r_3 // t2 = [5·r_2, r_4, 5·r_2, r_4, 5·r_2, r_4, 5·r_2, r_4] -> ·r_4 let mut t2 = _mm256_permute4x64_epi64(y, set02(2, 2, 2, 2)); // v4 = [ // x34·r_0 + x31·r_3 + x30·r_4, // x24·r_0 + x21·r_3 + x20·r_4, // x14·r_0 + x11·r_3 + x10·r_4, // x04·r_0 + x01·r_3 + x00·r_4, // ] v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v0, t2)); // + xN0·r_4 // x.v0 = [x30, x32, x20, x22, x10, x12, x00, x02] // x.v1 = [x31, x33, x21, x23, x11, x13, x01, x03] // t2 = [r_4, 5·r_2, r_4, 5·r_2, r_4, 5·r_2, r_4, 5·r_2] -> ·5·r_2 x.v0 = _mm256_permutevar8x32_epi32(x.v0, ord); x.v1 = _mm256_permutevar8x32_epi32(x.v1, ord); t2 = _mm256_permutevar8x32_epi32(t2, ord); // v0 = [ // x30·r_0 + 5·x33·r_2, // x20·r_0 + 5·x23·r_2, // x10·r_0 + 5·x13·r_2, // x00·r_0 + 5·x03·r_2, // ] // v1 = [ // x31·r_0 + x30·r_1 + 5·x34·r_2, // x21·r_0 + x20·r_1 + 5·x24·r_2, // x11·r_0 + x10·r_1 + 5·x14·r_2, // x01·r_0 + x00·r_1 + 5·x04·r_2, // ] // v3 = [ // x32·r_1 + x31·r_2 + x30·r_3, // x22·r_1 + x21·r_2 + x20·r_3, // x12·r_1 + x11·r_2 + x10·r_3, // x02·r_1 + x01·r_2 + x00·r_3, // ] // v4 = [ // x34·r_0 + x33·r_1 + x31·r_3 + x30·r_4, // x24·r_0 + x23·r_1 + x21·r_3 + x20·r_4, // x14·r_0 + x13·r_1 + x11·r_3 + x10·r_4, // x04·r_0 + x03·r_1 + x01·r_3 + x00·r_4, // ] v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, t2)); // + 5·xN3·r_2 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v2, t2)); // + 5·xN4·r_2 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, t0)); // + xN2·r_1 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, t0)); // + xN3·r_1 // t0 = [r_1, r_0, r_1, r_0, r_1, r_0, r_1, r_0] -> ·r_0 // t1 = [r_3, r_2, r_3, r_2, r_3, r_2, r_3, r_2] -> ·r_2 t0 = _mm256_permutevar8x32_epi32(t0, ord); t1 = _mm256_permutevar8x32_epi32(t1, ord); // v2 = [ // x32·r_0 + x31·r_1 + x30·r_2, // x22·r_0 + x21·r_1 + x20·r_2, // x12·r_0 + x11·r_1 + x10·r_2, // x02·r_0 + x01·r_1 + x00·r_2, // ] // v3 = [ // x33·r_0 + x32·r_1 + x31·r_2 + x30·r_3, // x23·r_0 + x22·r_1 + x21·r_2 + x20·r_3, // x13·r_0 + x12·r_1 + 
x11·r_2 + x10·r_3, // x03·r_0 + x02·r_1 + x01·r_2 + x00·r_3, // ] // v4 = [ // x34·r_0 + x33·r_1 + x32·r_2 + x31·r_3 + x30·r_4, // x24·r_0 + x23·r_1 + x22·r_2 + x21·r_3 + x20·r_4, // x14·r_0 + x13·r_1 + x12·r_2 + x11·r_3 + x10·r_4, // x04·r_0 + x03·r_1 + x02·r_2 + x01·r_3 + x00·r_4, // ] v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v0, t0)); // + xN2·r_0 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v1, t0)); // + xN3·r_0 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v0, t1)); // + xN2·r_2 // t0 = [5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3] -> ·5·r_3 t0 = _mm256_permute4x64_epi64(y, set02(3, 3, 3, 3)); // v0 = [ // x30·r_0 + 5·x33·r_2 + 5·x32·r_3, // x20·r_0 + 5·x23·r_2 + 5·x22·r_3, // x10·r_0 + 5·x13·r_2 + 5·x12·r_3, // x00·r_0 + 5·x03·r_2 + 5·x02·r_3, // ] // v1 = [ // x31·r_0 + x30·r_1 + 5·x34·r_2 + 5·x33·r_3, // x21·r_0 + x20·r_1 + 5·x24·r_2 + 5·x23·r_3, // x11·r_0 + x10·r_1 + 5·x14·r_2 + 5·x13·r_3, // x01·r_0 + x00·r_1 + 5·x04·r_2 + 5·x03·r_3, // ] // v2 = [ // x32·r_0 + x31·r_1 + x30·r_2 + 5·x34·r_3, // x22·r_0 + x21·r_1 + x20·r_2 + 5·x24·r_3, // x12·r_0 + x11·r_1 + x10·r_2 + 5·x14·r_3, // x02·r_0 + x01·r_1 + x00·r_2 + 5·x04·r_3, // ] v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v0, t0)); // + 5·xN2·r_3 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v1, t0)); // + 5·xN3·r_3 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v2, t0)); // + 5·xN4·r_3 // t0 = [5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4, 5·r_3, 5·r_4] -> ·5·r_4 t0 = _mm256_permutevar8x32_epi32(t0, ord); // v1 = [ // x31·r_0 + x30·r_1 + 5·x34·r_2 + 5·x33·r_3 + 5·x32·r_4, // x21·r_0 + x20·r_1 + 5·x24·r_2 + 5·x23·r_3 + 5·x22·r_4, // x11·r_0 + x10·r_1 + 5·x14·r_2 + 5·x13·r_3 + 5·x12·r_4, // x01·r_0 + x00·r_1 + 5·x04·r_2 + 5·x03·r_3 + 5·x02·r_4, // ] // v2 = [ // x32·r_0 + x31·r_1 + x30·r_2 + 5·x34·r_3 + 5·x33·r_4, // x22·r_0 + x21·r_1 + x20·r_2 + 5·x24·r_3 + 5·x23·r_4, // x12·r_0 + x11·r_1 + x10·r_2 + 5·x14·r_3 + 5·x13·r_4, // x02·r_0 + x01·r_1 + x00·r_2 + 5·x04·r_3 + 5·x03·r_4, // ] // v3 = [ // x33·r_0 + x32·r_1 + x31·r_2 + x30·r_3 + 5·x34·r_4, // x23·r_0 + x22·r_1 + x21·r_2 + x20·r_3 + 5·x24·r_4, // x13·r_0 + x12·r_1 + x11·r_2 + x10·r_3 + 5·x14·r_4, // x03·r_0 + x02·r_1 + x01·r_2 + x00·r_3 + 5·x04·r_4, // ] v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, t0)); // + 5·xN2·r_4 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, t0)); // + 5·xN3·r_4 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v2, t0)); // + 5·xN4·r_4 // x.v1 = [x33, x31, x23, x21, x13, x11, x03, x01] x.v1 = _mm256_permutevar8x32_epi32(x.v1, ord); // v0 = [ // x30·r_0 + 5·x34·r_1 + 5·x33·r_2 + 5·x32·r_3 + 5·x31·r_4, // x20·r_0 + 5·x24·r_1 + 5·x23·r_2 + 5·x22·r_3 + 5·x21·r_4, // x10·r_0 + 5·x14·r_1 + 5·x13·r_2 + 5·x12·r_3 + 5·x11·r_4, // x00·r_0 + 5·x04·r_1 + 5·x03·r_2 + 5·x02·r_3 + 5·x01·r_4, // ] v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, t0)); // + 5·xN1·r_4 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v2, z)); // + 5·xN4·r_1 // The result: // v4 = [ // x34·r_0 + x33·r_1 + x32·r_2 + x31·r_3 + x30·r_4, // x24·r_0 + x23·r_1 + x22·r_2 + x21·r_3 + x20·r_4, // x14·r_0 + x13·r_1 + x12·r_2 + x11·r_3 + x10·r_4, // x04·r_0 + x03·r_1 + x02·r_2 + x01·r_3 + x00·r_4, // ] // v3 = [ // x33·r_0 + x32·r_1 + x31·r_2 + x30·r_3 + 5·x34·r_4, // x23·r_0 + x22·r_1 + x21·r_2 + x20·r_3 + 5·x24·r_4, // x13·r_0 + x12·r_1 + x11·r_2 + x10·r_3 + 5·x14·r_4, // x03·r_0 + x02·r_1 + x01·r_2 + x00·r_3 + 5·x04·r_4, // ] // v2 = [ // x32·r_0 + x31·r_1 + x30·r_2 + 5·x34·r_3 + 5·x33·r_4, // x22·r_0 + x21·r_1 + x20·r_2 + 5·x24·r_3 + 5·x23·r_4, // x12·r_0 + x11·r_1 + x10·r_2 + 5·x14·r_3 + 
5·x13·r_4, // x02·r_0 + x01·r_1 + x00·r_2 + 5·x04·r_3 + 5·x03·r_4, // ] // v1 = [ // x31·r_0 + x30·r_1 + 5·x34·r_2 + 5·x33·r_3 + 5·x32·r_4, // x21·r_0 + x20·r_1 + 5·x24·r_2 + 5·x23·r_3 + 5·x22·r_4, // x11·r_0 + x10·r_1 + 5·x14·r_2 + 5·x13·r_3 + 5·x12·r_4, // x01·r_0 + x00·r_1 + 5·x04·r_2 + 5·x03·r_3 + 5·x02·r_4, // ] // v0 = [ // x30·r_0 + 5·x34·r_1 + 5·x33·r_2 + 5·x32·r_3 + 5·x31·r_4, // x20·r_0 + 5·x24·r_1 + 5·x23·r_2 + 5·x22·r_3 + 5·x21·r_4, // x10·r_0 + 5·x14·r_1 + 5·x13·r_2 + 5·x12·r_3 + 5·x11·r_4, // x00·r_0 + 5·x04·r_1 + 5·x03·r_2 + 5·x02·r_3 + 5·x01·r_4, // ] Unreduced4x130 { v0, v1, v2, v3, v4 } } } } impl Mul for Aligned4x130 { type Output = Unreduced4x130; #[inline(always)] fn mul(self, m: SpacedMultiplier4x130) -> Unreduced4x130 { unsafe { // Starting with the following limb layout: // x.v2 = [ _, x34, _, x24, _, x14, _, x04] // x.v1 = [ x33, x31, x23, x21, x13, x11, x03, x01] // x.v0 = [ x32, x30, x22, x20, x12, x10, x02, x00] // m.v1 = [ r2_4, r2_2, r2_0, r4_4, r4_3, r4_2, r4_1, r4_0] // m.v0 = [ r2_4, r2_3, r2_1, r3_4, r3_3, r3_2, r3_1, r3_0] // r1 = [5·r1_4, 5·r1_3, 5·r1_2, r1_4, r1_3, r1_2, r1_1, r1_0] let mut x = self; let r1 = m.r1.a; // v0 = [r2_0, r2_1, r4_4, r3_4, r4_1, r3_1, r4_0, r3_0] // v1 = [r2_4, r2_4, r2_2, r2_3, r4_3, r3_3, r4_2, r3_2] let v0 = _mm256_unpacklo_epi32(m.v0, m.v1); let v1 = _mm256_unpackhi_epi32(m.v0, m.v1); // m_r_0 = [r1_1, r1_0, r2_1, r2_0, r3_1, r3_0, r4_1, r4_0] -> ·rN_0 // m_r_2 = [r1_3, r1_2, r2_3, r2_2, r3_3, r3_2, r4_3, r4_2] -> ·rN_2 // m_r_4 = [r1_1, r1_4, r2_1, r2_4, r3_1, r3_4, r4_1, r4_4] -> ·rN_4 let ord = _mm256_set_epi32(1, 0, 6, 7, 2, 0, 3, 1); let m_r_0 = _mm256_blend_epi32( _mm256_permutevar8x32_epi32(r1, ord), _mm256_permutevar8x32_epi32(v0, ord), 0b00111111, ); let ord = _mm256_set_epi32(3, 2, 4, 5, 2, 0, 3, 1); let m_r_2 = _mm256_blend_epi32( _mm256_permutevar8x32_epi32(r1, ord), _mm256_permutevar8x32_epi32(v1, ord), 0b00111111, ); let ord = _mm256_set_epi32(1, 4, 6, 6, 2, 4, 3, 5); let m_r_4 = _mm256_blend_epi32( _mm256_blend_epi32( _mm256_permutevar8x32_epi32(r1, ord), _mm256_permutevar8x32_epi32(v1, ord), 0b00010000, ), _mm256_permutevar8x32_epi32(v0, ord), 0b00101111, ); // v0 = [x30·r1_0, x20·r2_0, x10·r3_0, x00·r4_0] // v1 = [x31·r1_0, x21·r2_0, x11·r3_0, x01·r4_0] // v2 = [x30·r1_2, x20·r2_2, x10·r3_2, x00·r4_2] // v3 = [x31·r1_2, x21·r2_2, x11·r3_2, x01·r4_2] // v4 = [x30·r1_4, x20·r2_4, x10·r3_4, x00·r4_4] let mut v0 = _mm256_mul_epu32(x.v0, m_r_0); // xM0·rN_0 let mut v1 = _mm256_mul_epu32(x.v1, m_r_0); // xM1·rN_0 let mut v2 = _mm256_mul_epu32(x.v0, m_r_2); // xM0·rN_2 let mut v3 = _mm256_mul_epu32(x.v1, m_r_2); // xM1·rN_2 let mut v4 = _mm256_mul_epu32(x.v0, m_r_4); // xM0·rN_4 // m_r_1 = [r1_0, r1_1, r2_0, r2_1, r3_0, r3_1, r4_0, r4_1] -> ·rN_1 // m_r_3 = [r1_2, r1_3, r2_2, r2_3, r3_2, r3_3, r4_2, r4_3] -> ·rN_3 let ord = _mm256_set_epi32(6, 7, 4, 5, 2, 3, 0, 1); let m_r_1 = _mm256_permutevar8x32_epi32(m_r_0, ord); let m_r_3 = _mm256_permutevar8x32_epi32(m_r_2, ord); // v1 = [ // x31·r1_0 + x30·r1_1, // x21·r2_0 + x20·r2_1, // x11·r3_0 + x10·r3_1, // x01·r4_0 + x00·r4_1, // ] // v2 = [ // x31·r1_1 + x30·r1_2, // x21·r2_1 + x20·r2_2, // x11·r3_1 + x10·r3_2, // x01·r4_1 + x00·r4_2, // ] // v3 = [ // x31·r1_2 + x30·r1_3, // x21·r2_2 + x20·r2_3, // x11·r3_2 + x10·r3_3, // x01·r4_2 + x00·r4_3, // ] // v4 = [ // x34·r1_0 + x31·r1_3 + x30·r1_4, // x24·r2_0 + x21·r2_3 + x20·r2_4, // x14·r3_0 + x11·r3_3 + x10·r3_4, // x04·r4_0 + x01·r4_3 + x00·r4_4, // ] v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, m_r_1)); // + 
xM0·rN_1 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, m_r_1)); // + xM1·rN_1 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, m_r_3)); // + xM0·rN_3 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, m_r_3)); // + xM1·rN_3 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v2, m_r_0)); // + xM4·rN_0 // x.v0 = [x30, x32, x20, x22, x10, x12, x00, x02] x.v0 = _mm256_permutevar8x32_epi32(x.v0, ord); // v2 = [ // x32·r1_0 + x31·r1_1 + x30·r1_2, // x22·r2_0 + x21·r2_1 + x20·r2_2, // x12·r3_0 + x11·r3_1 + x10·r3_2, // x02·r4_0 + x01·r4_1 + x00·r4_2, // ] // v3 = [ // x32·r1_1 + x31·r1_2 + x30·r1_3, // x22·r2_1 + x21·r2_2 + x20·r2_3, // x12·r3_1 + x11·r3_2 + x10·r3_3, // x02·r4_1 + x01·r4_2 + x00·r4_3, // ] // v4 = [ // x34·r1_0 + x32·r1_2 + x31·r1_3 + x30·r1_4, // x24·r2_0 + x22·r2_2 + x21·r2_3 + x20·r2_4, // x14·r3_0 + x12·r3_2 + x11·r3_3 + x10·r3_4, // x04·r4_0 + x02·r4_2 + x01·r4_3 + x00·r4_4, // ] v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v0, m_r_0)); // + xM2·rN_0 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v0, m_r_1)); // + xM2·rN_1 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v0, m_r_2)); // + xM2·rN_2 // m_5r_3 = [5·r1_2, 5·r1_3, 5·r2_2, 5·r2_3, 5·r3_2, 5·r3_3, 5·r4_2, 5·r4_3] -> ·5·rN_3 // m_5r_4 = [5·r1_1, 5·r1_4, 5·r2_1, 5·r2_4, 5·r3_1, 5·r3_4, 5·r4_1, 5·r4_4] -> ·5·rN_4 let m_5r_3 = _mm256_add_epi32(m_r_3, _mm256_slli_epi32(m_r_3, 2)); let m_5r_4 = _mm256_add_epi32(m_r_4, _mm256_slli_epi32(m_r_4, 2)); // v0 = [ // x30·r1_0 + 5·x32·r1_3 + 5·x31·r1_4, // x20·r2_0 + 5·x22·r2_3 + 5·x21·r2_4, // x10·r3_0 + 5·x12·r3_3 + 5·x11·r3_4, // x00·r4_0 + 5·x02·r4_3 + 5·x01·r4_4, // ] // v1 = [ // x31·r1_0 + x30·r1_1 + 5·x32·r1_4, // x21·r2_0 + x20·r2_1 + 5·x22·r2_4, // x11·r3_0 + x10·r3_1 + 5·x12·r3_4, // x01·r4_0 + x00·r4_1 + 5·x02·r4_4, // ] // v2 = [ // x32·r1_0 + x31·r1_1 + x30·r1_2 + 5·x34·r1_3, // x22·r2_0 + x21·r2_1 + x20·r2_2 + 5·x24·r2_3, // x12·r3_0 + x11·r3_1 + x10·r3_2 + 5·x14·r3_3, // x02·r4_0 + x01·r4_1 + x00·r4_2 + 5·x04·r4_3, // ] // v3 = [ // x32·r1_1 + x31·r1_2 + x30·r1_3 + 5·x34·r1_4, // x22·r2_1 + x21·r2_2 + x20·r2_3 + 5·x24·r2_4, // x12·r3_1 + x11·r3_2 + x10·r3_3 + 5·x14·r3_4, // x02·r4_1 + x01·r4_2 + x00·r4_3 + 5·x04·r4_4, // ] v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v0, m_5r_3)); // + 5·xM2·rN_3 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, m_5r_4)); // + 5·xM1·rN_4 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v0, m_5r_4)); // + 5·xM2·rN_4 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v2, m_5r_3)); // + 5·xM4·rN_3 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v2, m_5r_4)); // + 5·xM4·rN_4 // x.v1 = [x31, x33, x21, x23, x11, x13, x01, x03] x.v1 = _mm256_permutevar8x32_epi32(x.v1, ord); // v1 = [ // x31·r1_0 + x30·r1_1 + 5·x33·r1_3 + 5·x32·r1_4, // x21·r2_0 + x20·r2_1 + 5·x23·r2_3 + 5·x22·r2_4, // x11·r3_0 + x10·r3_1 + 5·x13·r3_3 + 5·x12·r3_4, // x01·r4_0 + x00·r4_1 + 5·x03·r4_3 + 5·x02·r4_4, // ] // v2 = [ // x32·r1_0 + x31·r1_1 + x30·r1_2 + 5·x34·r1_3 + 5·x33·r1_4, // x22·r2_0 + x21·r2_1 + x20·r2_2 + 5·x24·r2_3 + 5·x23·r2_4, // x12·r3_0 + x11·r3_1 + x10·r3_2 + 5·x14·r3_3 + 5·x13·r3_4, // x02·r4_0 + x01·r4_1 + x00·r4_2 + 5·x04·r4_3 + 5·x03·r4_4, // ] // v3 = [ // x33·r1_0 + x32·r1_1 + x31·r1_2 + x30·r1_3 + 5·x34·r1_4, // x23·r2_0 + x22·r2_1 + x21·r2_2 + x20·r2_3 + 5·x24·r2_4, // x13·r3_0 + x12·r3_1 + x11·r3_2 + x10·r3_3 + 5·x14·r3_4, // x03·r4_0 + x02·r4_1 + x01·r4_2 + x00·r4_3 + 5·x04·r4_4, // ] // v4 = [ // x34·r1_0 + x33·r1_1 + x32·r1_2 + x31·r1_3 + x30·r1_4, // x24·r2_0 + x23·r2_1 + x22·r2_2 + x21·r2_3 + x20·r2_4, // x14·r3_0 + x13·r3_1 + x12·r3_2 + x11·r3_3 + 
x10·r3_4, // x04·r4_0 + x03·r4_1 + x02·r4_2 + x01·r4_3 + x00·r4_4, // ] v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v1, m_5r_3)); // + 5·xM3·rN_3 v2 = _mm256_add_epi64(v2, _mm256_mul_epu32(x.v1, m_5r_4)); // + 5·xM3·rN_4 v3 = _mm256_add_epi64(v3, _mm256_mul_epu32(x.v1, m_r_0)); // + xM3·rN_0 v4 = _mm256_add_epi64(v4, _mm256_mul_epu32(x.v1, m_r_1)); // + xM3·rN_1 // m_5r_1 = [5·r1_4, 5·r1_1, 5·r2_4, 5·r2_1, 5·r3_4, 5·r3_1, 5·r4_4, 5·r4_1] -> ·5·rN_1 // m_5r_2 = [5·r1_3, 5·r1_2, 5·r2_3, 5·r2_2, 5·r3_3, 5·r3_2, 5·r4_3, 5·r4_2] -> ·5·rN_2 let m_5r_1 = _mm256_permutevar8x32_epi32(m_5r_4, ord); let m_5r_2 = _mm256_permutevar8x32_epi32(m_5r_3, ord); // v0 = [ // x30·r1_0 + 5·x34·r1_1 + 5·x33·r1_2 + 5·x32·r1_3 + 5·x31·r1_4, // x20·r2_0 + 5·x24·r2_1 + 5·x23·r2_2 + 5·x22·r2_3 + 5·x21·r2_4, // x10·r3_0 + 5·x14·r3_1 + 5·x13·r3_2 + 5·x12·r3_3 + 5·x11·r3_4, // x00·r4_0 + 5·x04·r4_1 + 5·x03·r4_2 + 5·x02·r4_3 + 5·x01·r4_4, // ] // v1 = [ // x31·r1_0 + x30·r1_1 + 5·x34·r1_2 + 5·x33·r1_3 + 5·x32·r1_4, // x21·r2_0 + x20·r2_1 + 5·x24·r2_2 + 5·x23·r2_3 + 5·x22·r2_4, // x11·r3_0 + x10·r3_1 + 5·x14·r3_2 + 5·x13·r3_3 + 5·x12·r3_4, // x01·r4_0 + x00·r4_1 + 5·x04·r4_2 + 5·x03·r4_3 + 5·x02·r4_4, // ] v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v1, m_5r_2)); // + 5·xM3·rN_2 v0 = _mm256_add_epi64(v0, _mm256_mul_epu32(x.v2, m_5r_1)); // + 5·xM4·rN_1 v1 = _mm256_add_epi64(v1, _mm256_mul_epu32(x.v2, m_5r_2)); // + 5·xM4·rN_2 // The result: // v4 = [ // x34·r1_0 + x33·r1_1 + x32·r1_2 + x31·r1_3 + x30·r1_4, // x24·r2_0 + x23·r2_1 + x22·r2_2 + x21·r2_3 + x20·r2_4, // x14·r3_0 + x13·r3_1 + x12·r3_2 + x11·r3_3 + x10·r3_4, // x04·r4_0 + x03·r4_1 + x02·r4_2 + x01·r4_3 + x00·r4_4, // ] // v3 = [ // x33·r1_0 + x32·r1_1 + x31·r1_2 + x30·r1_3 + 5·x34·r1_4, // x23·r2_0 + x22·r2_1 + x21·r2_2 + x20·r2_3 + 5·x24·r2_4, // x13·r3_0 + x12·r3_1 + x11·r3_2 + x10·r3_3 + 5·x14·r3_4, // x03·r4_0 + x02·r4_1 + x01·r4_2 + x00·r4_3 + 5·x04·r4_4, // ] // v2 = [ // x32·r1_0 + x31·r1_1 + x30·r1_2 + 5·x34·r1_3 + 5·x33·r1_4, // x22·r2_0 + x21·r2_1 + x20·r2_2 + 5·x24·r2_3 + 5·x23·r2_4, // x12·r3_0 + x11·r3_1 + x10·r3_2 + 5·x14·r3_3 + 5·x13·r3_4, // x02·r4_0 + x01·r4_1 + x00·r4_2 + 5·x04·r4_3 + 5·x03·r4_4, // ] // v1 = [ // x31·r1_0 + x30·r1_1 + 5·x34·r1_2 + 5·x33·r1_3 + 5·x32·r1_4, // x21·r2_0 + x20·r2_1 + 5·x24·r2_2 + 5·x23·r2_3 + 5·x22·r2_4, // x11·r3_0 + x10·r3_1 + 5·x14·r3_2 + 5·x13·r3_3 + 5·x12·r3_4, // x01·r4_0 + x00·r4_1 + 5·x04·r4_2 + 5·x03·r4_3 + 5·x02·r4_4, // ] // v0 = [ // x30·r1_0 + 5·x34·r1_1 + 5·x33·r1_2 + 5·x32·r1_3 + 5·x31·r1_4, // x20·r2_0 + 5·x24·r2_1 + 5·x23·r2_2 + 5·x22·r2_3 + 5·x21·r2_4, // x10·r3_0 + 5·x14·r3_1 + 5·x13·r3_2 + 5·x12·r3_3 + 5·x11·r3_4, // x00·r4_0 + 5·x04·r4_1 + 5·x03·r4_2 + 5·x02·r4_3 + 5·x01·r4_4, // ] Unreduced4x130 { v0, v1, v2, v3, v4 } } } } /// The unreduced output of an Aligned4x130 multiplication. 
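///
/// Each 64-bit lane of `vN` holds limb `N` (weight 2^(26·N)) of one of the four
/// products. The limbs live in full 64-bit lanes because they are sums of
/// 26-bit × 26-bit partial products that have not yet been carried or reduced;
/// `reduce` below brings each limb back down to 26 bits.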
#[derive(Clone, Debug)] pub(super) struct Unreduced4x130 { v0: __m256i, v1: __m256i, v2: __m256i, v3: __m256i, v4: __m256i, } impl fmt::Display for Unreduced4x130 { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let mut v0 = [0u8; 32]; let mut v1 = [0u8; 32]; let mut v2 = [0u8; 32]; let mut v3 = [0u8; 32]; let mut v4 = [0u8; 32]; unsafe { _mm256_storeu_si256(v0.as_mut_ptr() as *mut _, self.v0); _mm256_storeu_si256(v1.as_mut_ptr() as *mut _, self.v1); _mm256_storeu_si256(v2.as_mut_ptr() as *mut _, self.v2); _mm256_storeu_si256(v3.as_mut_ptr() as *mut _, self.v3); _mm256_storeu_si256(v4.as_mut_ptr() as *mut _, self.v4); } writeln!(f, "Unreduced4x130([")?; write!(f, " ")?; write_130_wide( f, [ u64::from_le_bytes(v0[0..8].try_into().unwrap()), u64::from_le_bytes(v1[0..8].try_into().unwrap()), u64::from_le_bytes(v2[0..8].try_into().unwrap()), u64::from_le_bytes(v3[0..8].try_into().unwrap()), u64::from_le_bytes(v4[0..8].try_into().unwrap()), ], )?; writeln!(f, ",")?; write!(f, " ")?; write_130_wide( f, [ u64::from_le_bytes(v0[8..16].try_into().unwrap()), u64::from_le_bytes(v1[8..16].try_into().unwrap()), u64::from_le_bytes(v2[8..16].try_into().unwrap()), u64::from_le_bytes(v3[8..16].try_into().unwrap()), u64::from_le_bytes(v4[8..16].try_into().unwrap()), ], )?; writeln!(f, ",")?; write!(f, " ")?; write_130_wide( f, [ u64::from_le_bytes(v0[16..24].try_into().unwrap()), u64::from_le_bytes(v1[16..24].try_into().unwrap()), u64::from_le_bytes(v2[16..24].try_into().unwrap()), u64::from_le_bytes(v3[16..24].try_into().unwrap()), u64::from_le_bytes(v4[16..24].try_into().unwrap()), ], )?; writeln!(f, ",")?; write!(f, " ")?; write_130_wide( f, [ u64::from_le_bytes(v0[24..32].try_into().unwrap()), u64::from_le_bytes(v1[24..32].try_into().unwrap()), u64::from_le_bytes(v2[24..32].try_into().unwrap()), u64::from_le_bytes(v3[24..32].try_into().unwrap()), u64::from_le_bytes(v4[24..32].try_into().unwrap()), ], )?; writeln!(f, ",")?; write!(f, "])") } } impl Unreduced4x130 { #[inline(always)] pub(super) fn reduce(self) -> Aligned4x130 { unsafe { // Starting with the following limb layout across 64-bit words: // x.v4 = [u34, u24, u14, u04] // x.v3 = [u33, u23, u13, u03] // x.v2 = [u32, u22, u12, u02] // x.v1 = [u31, u21, u11, u01] // x.v0 = [u30, u20, u10, u00] let x = self; // 26-bit mask on each 64-bit word. let mask_26 = _mm256_set1_epi64x(0x3ffffff); // Carry from x0 up into x1, returning their new values. let adc = |x1: __m256i, x0: __m256i| -> (__m256i, __m256i) { let y1 = _mm256_add_epi64(x1, _mm256_srli_epi64(x0, 26)); let y0 = _mm256_and_si256(x0, mask_26); (y1, y0) }; // Reduce modulo 2^130 - 5 from x4 down into x0, returning their new values. let red = |x4: __m256i, x0: __m256i| -> (__m256i, __m256i) { let y0 = _mm256_add_epi64( x0, _mm256_mul_epu32(_mm256_srli_epi64(x4, 26), _mm256_set1_epi64x(5)), ); let y4 = _mm256_and_si256(x4, mask_26); (y4, y0) }; // Reduce the four integers in parallel to below 2^130. let (red_1, red_0) = adc(x.v1, x.v0); let (red_4, red_3) = adc(x.v4, x.v3); let (red_2, red_1) = adc(x.v2, red_1); let (red_4, red_0) = red(red_4, red_0); let (red_3, red_2) = adc(red_3, red_2); let (red_1, red_0) = adc(red_1, red_0); let (red_4, red_3) = adc(red_4, red_3); // At this point, all limbs are contained within the lower 32 bits of each // 64-bit word. The upper limb of each integer (in red_4) is positioned // correctly for Aligned4x130, but the other limbs need to be blended // together: // - v0 contains limbs 0 and 2. // - v1 contains limbs 1 and 3. 
Aligned4x130 { v0: _mm256_blend_epi32(red_0, _mm256_slli_epi64(red_2, 32), 0b10101010), v1: _mm256_blend_epi32(red_1, _mm256_slli_epi64(red_3, 32), 0b10101010), v2: red_4, } } } /// Returns the unreduced sum of the four 130-bit integers. #[inline(always)] pub(super) fn sum(self) -> Unreduced130 { unsafe { // Starting with the following limb layout across 64-bit words: // x.v4 = [u34, u24, u14, u04] // x.v3 = [u33, u23, u13, u03] // x.v2 = [u32, u22, u12, u02] // x.v1 = [u31, u21, u11, u01] // x.v0 = [u30, u20, u10, u00] let x = self; // v0 = [ // u31 + u21, // u30 + u20, // u11 + u01, // u10 + u00, // ] let v0 = _mm256_add_epi64( _mm256_unpackhi_epi64(x.v0, x.v1), _mm256_unpacklo_epi64(x.v0, x.v1), ); // v1 = [ // u33 + u23, // u32 + u22, // u13 + u03, // u12 + u02, // ] let v1 = _mm256_add_epi64( _mm256_unpackhi_epi64(x.v2, x.v3), _mm256_unpacklo_epi64(x.v2, x.v3), ); // v0 = [ // u33 + u23 + u13 + u03, // u32 + u22 + u12 + u02, // u31 + u21 + u11 + u01, // u30 + u20 + u10 + u00, // ] let v0 = _mm256_add_epi64( _mm256_inserti128_si256(v0, _mm256_castsi256_si128(v1), 1), _mm256_inserti128_si256(v1, _mm256_extractf128_si256(v0, 1), 0), ); // v1 = [ // u34 + u14, // u24 + u04, // u14 + u34, // u04 + u24, // ] let v1 = _mm256_add_epi64(x.v4, _mm256_permute4x64_epi64(x.v4, set02(1, 0, 3, 2))); // v1 = [ // u34 + u24 + u14 + u04, // u24 + u24 + u04 + u04, // u34 + u24 + u14 + u04, // u34 + u24 + u14 + u04, // ] let v1 = _mm256_add_epi64(v1, _mm256_permute4x64_epi64(v1, set02(0, 0, 0, 1))); // The result: // v1 = [ // u34 + u24 + u14 + u04, // u24 + u24 + u04 + u04, // u34 + u24 + u14 + u04, // u34 + u24 + u14 + u04, // ] // v0 = [ // u33 + u23 + u13 + u03, // u32 + u22 + u12 + u02, // u31 + u21 + u11 + u01, // u30 + u20 + u10 + u00, // ] // This corresponds to: // v1 = [ _, _, _, t_4] // v0 = [t_3, t_2, t_1, t_0] Unreduced130 { v0, v1 } } } } #[derive(Clone, Copy, Debug)] pub(super) struct AdditionKey(__m256i); impl Add for AdditionKey { type Output = IntegerTag; /// Computes x + k mod 2^128 #[inline(always)] fn add(self, x: Aligned130) -> IntegerTag { unsafe { // Starting with the following limb layout: // x = [0, _, _, x4, x3, x2, x1, x0] // k = [0, k7, 0, k6, 0, k5, 0, k4] let mut x = _mm256_and_si256(x.0, _mm256_set_epi32(0, 0, 0, -1, -1, -1, -1, -1)); let k = self.0; /// Reduce to an integer below 2^130. unsafe fn propagate_carry(x: __m256i) -> __m256i { // t = [ // 0, // 0, // 0, // x3 >> 26, // x2 >> 26, // x1 >> 26, // x0 >> 26, // x4 >> 26, // ]; let t = _mm256_permutevar8x32_epi32( _mm256_srli_epi32(x, 26), _mm256_set_epi32(7, 7, 7, 3, 2, 1, 0, 4), ); // [ // 0, // 0, // 0, // x4 % 2^26, // x3 % 2^26, // x2 % 2^26, // x1 % 2^26, // x0 % 2^26, // ] // + t + [0, 0, 0, 0, 0, 0, 0, 4·(x4 >> 26)] // = [ // 0, // 0, // 0, // x4 % 2^26 + x3 >> 26, // x3 % 2^26 + x2 >> 26, // x2 % 2^26 + x1 >> 26, // x1 % 2^26 + x0 >> 26, // x0 % 2^26 + 5·(x4 >> 26), // ] => [0, 0, 0, x4, x3, x2, x1, x0] _mm256_add_epi32( _mm256_add_epi32( _mm256_and_si256( x, _mm256_set_epi32( 0, 0, 0, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, 0x3ffffff, ), ), t, ), _mm256_permutevar8x32_epi32( _mm256_slli_epi32(t, 2), _mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 0), ), ) } // Reduce modulus 2^130-5: // - Reduce to an integer below 2^130: // TODO: Is it more efficient to unpack the limbs for this? 
for _ in 0..5 { x = propagate_carry(x); } // - Compute x + -p by adding 5 and carrying up to the top limb: // g = [0, 0, 0, g4, g3, g2, g1, g0] let mut g = _mm256_add_epi32(x, _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 5)); // TODO: Is it more efficient to unpack the limbs for this? for _ in 0..4 { g = propagate_carry(g); } let g = _mm256_sub_epi32(g, _mm256_set_epi32(0, 0, 0, 1 << 26, 0, 0, 0, 0)); // - Check whether g4 overflowed: let mask = _mm256_permutevar8x32_epi32( _mm256_sub_epi32(_mm256_srli_epi32(g, 32 - 1), _mm256_set1_epi32(1)), _mm256_set1_epi32(4), ); // - Select x if g4 overflowed, else g: let x = _mm256_or_si256( _mm256_and_si256(x, _mm256_xor_si256(mask, _mm256_set1_epi32(-1))), _mm256_and_si256(g, mask), ); // Align back to 32 bits per digit. We drop the top two bits of the top limb, // because we only care about the lower 128 bits from here onward, and don't // need to track overflow or reduce. // [ // 0, // 0, // 0, // 0, // x4[24..0] || x3[26..18], // x3[18..0] || x2[26..12], // x2[12..0] || x1[26.. 6], // x1[ 6..0] || x0[26.. 0], // ] let x = _mm256_or_si256( _mm256_srlv_epi32(x, _mm256_set_epi32(32, 32, 32, 32, 18, 12, 6, 0)), _mm256_permutevar8x32_epi32( _mm256_sllv_epi32(x, _mm256_set_epi32(32, 32, 32, 8, 14, 20, 26, 32)), _mm256_set_epi32(7, 7, 7, 7, 4, 3, 2, 1), ), ); // Add key // [ // (x4[24..0] || x3[26..18]) + k7, // (x3[18..0] || x2[26..12]) + k6, // (x2[12..0] || x1[26.. 6]) + k5, // (x1[ 6..0] || x0[26.. 0]) + k4, // ] let mut x = _mm256_add_epi64( _mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 3, 7, 2, 7, 1, 7, 0)), k, ); // Ensure that all carries are handled unsafe fn propagate_carry_32(x: __m256i) -> __m256i { // [ // (l4 % 2^32) + (l3 >> 32), // (l3 % 2^32) + (l2 >> 32), // (l2 % 2^32) + (l1 >> 32), // (l1 % 2^32), // ] _mm256_add_epi64( _mm256_and_si256(x, _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1)), _mm256_permute4x64_epi64( _mm256_and_si256( _mm256_srli_epi64(x, 32), _mm256_set_epi64x(0, -1, -1, -1), ), set02(2, 1, 0, 3), ), ) } for _ in 0..3 { x = propagate_carry_32(x); } // Now that all limbs are at most 32 bits, realign from 64- to 32-bit limbs. // [ // 0, // 0, // 0, // 0, // ((x4[24..0] || x3[26..18]) + k7) % 2^32 + ((x3[18..0] || x2[26..12]) + k6) >> 32, // ((x3[18..0] || x2[26..12]) + k6) % 2^32 + ((x2[12..0] || x1[26.. 6]) + k5) >> 32, // ((x2[12..0] || x1[26.. 6]) + k5) % 2^32 + ((x1[ 6..0] || x0[26.. 0]) + k4) >> 32, // ((x1[ 6..0] || x0[26.. 0]) + k4) % 2^32, // ] let x = _mm256_permutevar8x32_epi32(x, _mm256_set_epi32(7, 7, 7, 7, 6, 4, 2, 0)); // Reduce modulus 2^128 IntegerTag(_mm256_castsi256_si128(x)) } } } pub(super) struct IntegerTag(__m128i); impl From for IntegerTag { fn from(k: AdditionKey) -> Self { unsafe { // There was no polynomial to add. IntegerTag(_mm256_castsi256_si128(_mm256_permutevar8x32_epi32( k.0, _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0), ))) } } } impl IntegerTag { pub(super) fn write(self, tag: &mut [u8]) { unsafe { _mm_storeu_si128(tag.as_mut_ptr() as *mut _, self.0); } } } poly1305-0.8.0/src/backend/avx2.rs000064400000000000000000000144660072674642500145740ustar 00000000000000//! AVX2 implementation of the Poly1305 state machine. 
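//!
//! As a rough sketch of what this backend computes (the standard Poly1305
//! recurrence, with all arithmetic modulo 2^130 - 5 and m_i the 16-byte blocks
//! padded to 17-byte integers):
//!
//! acc_i = (acc_{i-1} + m_i) · r, with acc_0 = 0
//!
//! The 4-way form below keeps four interleaved accumulator lanes, folds in four
//! blocks per step by multiplying the lanes by r^4 (`process_blocks`), and
//! defers the per-lane multiplications by r^1..r^4 to finalization, where the
//! lanes are merged and summed before the addition key `s` is applied.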
// The State struct and its logic were originally derived from Goll and Gueron's AVX2 C
// code:
// [Vectorization of Poly1305 message authentication code](https://ieeexplore.ieee.org/document/7113463)
//
// which was sourced from Bhattacharyya and Sarkar's modified variant:
// [Improved SIMD Implementation of Poly1305](https://eprint.iacr.org/2019/842)
// https://github.com/Sreyosi/Improved-SIMD-Implementation-of-Poly1305
//
// The logic has been extensively rewritten and documented, and several bugs in the
// original C code were fixed.
//
// Note that State only implements the original Goll-Gueron algorithm, not the
// optimisations provided by Bhattacharyya and Sarkar. The latter require the message
// length to be known, which is incompatible with the streaming API of UniversalHash.

use universal_hash::{
    consts::{U16, U4},
    crypto_common::{BlockSizeUser, ParBlocksSizeUser},
    generic_array::GenericArray,
    UhfBackend,
};

use crate::{Block, Key, Tag};

mod helpers;

use self::helpers::*;

/// Four Poly1305 blocks (64-bytes)
type ParBlocks = universal_hash::ParBlocks<State>;

#[derive(Copy, Clone)]
struct Initialized {
    p: Aligned4x130,
    m: SpacedMultiplier4x130,
    r4: PrecomputedMultiplier,
}

#[derive(Clone)]
pub(crate) struct State {
    k: AdditionKey,
    r1: PrecomputedMultiplier,
    r2: PrecomputedMultiplier,
    initialized: Option<Initialized>,
    cached_blocks: [Block; 4],
    num_cached_blocks: usize,
    partial_block: Option<Block>,
}

impl State {
    /// Initialize Poly1305 [`State`] with the given key
    pub(crate) fn new(key: &Key) -> Self {
        // Prepare addition key and polynomial key.
        let (k, r1) = unsafe { prepare_keys(key) };

        // Precompute R^2.
        let r2 = (r1 * r1).reduce();

        State {
            k,
            r1,
            r2: r2.into(),
            initialized: None,
            cached_blocks: [Block::default(); 4],
            num_cached_blocks: 0,
            partial_block: None,
        }
    }

    /// Process four Poly1305 blocks at once.
    #[target_feature(enable = "avx2")]
    pub(crate) unsafe fn compute_par_blocks(&mut self, blocks: &ParBlocks) {
        assert!(self.partial_block.is_none());
        assert_eq!(self.num_cached_blocks, 0);
        self.process_blocks(Aligned4x130::from_par_blocks(blocks));
    }

    /// Compute a Poly1305 block
    #[target_feature(enable = "avx2")]
    pub(crate) unsafe fn compute_block(&mut self, block: &Block, partial: bool) {
        // We can cache a single partial block.
        if partial {
            assert!(self.partial_block.is_none());
            self.partial_block = Some(*block);
            return;
        }

        self.cached_blocks[self.num_cached_blocks].copy_from_slice(block);
        if self.num_cached_blocks < 3 {
            self.num_cached_blocks += 1;
            return;
        } else {
            self.num_cached_blocks = 0;
        }

        self.process_blocks(Aligned4x130::from_blocks(&self.cached_blocks));
    }

    /// Process four aligned Poly1305 blocks
    #[target_feature(enable = "avx2")]
    unsafe fn process_blocks(&mut self, blocks: Aligned4x130) {
        if let Some(inner) = &mut self.initialized {
            // P <-- R^4 * P + blocks
            inner.p = (&inner.p * inner.r4).reduce() + blocks;
        } else {
            // Initialize the polynomial.
            let p = blocks;

            // Initialize the multiplier (used to merge down the polynomial during
            // finalization).
let (m, r4) = SpacedMultiplier4x130::new(self.r1, self.r2); self.initialized = Some(Initialized { p, m, r4 }) } } /// Finalize output producing a [`Tag`] #[target_feature(enable = "avx2")] pub(crate) unsafe fn finalize(&mut self) -> Tag { assert!(self.num_cached_blocks < 4); let mut data = &self.cached_blocks[..]; // T ← R◦T // P = T_0 + T_1 + T_2 + T_3 let mut p = self .initialized .take() .map(|inner| (inner.p * inner.m).sum().reduce()); if self.num_cached_blocks >= 2 { // Compute 32 byte block (remaining data < 64 bytes) let mut c = Aligned2x130::from_blocks(data[..2].try_into().unwrap()); if let Some(p) = p { c = c + p; } p = Some(c.mul_and_sum(self.r1, self.r2).reduce()); data = &data[2..]; self.num_cached_blocks -= 2; } if self.num_cached_blocks == 1 { // Compute 16 byte block (remaining data < 32 bytes) let mut c = Aligned130::from_block(&data[0]); if let Some(p) = p { c = c + p; } p = Some((c * self.r1).reduce()); self.num_cached_blocks -= 1; } if let Some(block) = &self.partial_block { // Compute last block (remaining data < 16 bytes) let mut c = Aligned130::from_partial_block(block); if let Some(p) = p { c = c + p; } p = Some((c * self.r1).reduce()); } // Compute tag: p + k mod 2^128 let mut tag = GenericArray::::default(); let tag_int = if let Some(p) = p { self.k + p } else { self.k.into() }; tag_int.write(tag.as_mut_slice()); tag } } impl BlockSizeUser for State { type BlockSize = U16; } impl ParBlocksSizeUser for State { type ParBlocksSize = U4; } impl UhfBackend for State { fn proc_block(&mut self, block: &Block) { unsafe { self.compute_block(block, false) }; } fn proc_par_blocks(&mut self, blocks: &ParBlocks) { if self.num_cached_blocks == 0 { // Fast path. unsafe { self.compute_par_blocks(blocks) }; } else { // We are unaligned; use the slow fallback. for block in blocks { self.proc_block(block); } } } fn blocks_needed_to_align(&self) -> usize { if self.num_cached_blocks == 0 { // There are no cached blocks; fast path is available. 0 } else { // There are cached blocks; report how many more we need. self.cached_blocks.len() - self.num_cached_blocks } } } poly1305-0.8.0/src/backend/soft.rs000064400000000000000000000172550072674642500146660ustar 00000000000000//! Software implementation of the Poly1305 state machine. // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. 
// // This code originates from the rust-crypto project: // // // ...and was originally a port of Andrew Moons poly1305-donna // https://github.com/floodyberry/poly1305-donna use universal_hash::{ consts::{U1, U16}, crypto_common::{BlockSizeUser, ParBlocksSizeUser}, UhfBackend, UniversalHash, }; use crate::{Block, Key, Tag}; #[derive(Clone, Default)] pub(crate) struct State { r: [u32; 5], h: [u32; 5], pad: [u32; 4], } impl State { /// Initialize Poly1305 [`State`] with the given key pub(crate) fn new(key: &Key) -> State { let mut poly = State::default(); // r &= 0xffffffc0ffffffc0ffffffc0fffffff poly.r[0] = (u32::from_le_bytes(key[0..4].try_into().unwrap())) & 0x3ff_ffff; poly.r[1] = (u32::from_le_bytes(key[3..7].try_into().unwrap()) >> 2) & 0x3ff_ff03; poly.r[2] = (u32::from_le_bytes(key[6..10].try_into().unwrap()) >> 4) & 0x3ff_c0ff; poly.r[3] = (u32::from_le_bytes(key[9..13].try_into().unwrap()) >> 6) & 0x3f0_3fff; poly.r[4] = (u32::from_le_bytes(key[12..16].try_into().unwrap()) >> 8) & 0x00f_ffff; poly.pad[0] = u32::from_le_bytes(key[16..20].try_into().unwrap()); poly.pad[1] = u32::from_le_bytes(key[20..24].try_into().unwrap()); poly.pad[2] = u32::from_le_bytes(key[24..28].try_into().unwrap()); poly.pad[3] = u32::from_le_bytes(key[28..32].try_into().unwrap()); poly } /// Compute a Poly1305 block pub(crate) fn compute_block(&mut self, block: &Block, partial: bool) { let hibit = if partial { 0 } else { 1 << 24 }; let r0 = self.r[0]; let r1 = self.r[1]; let r2 = self.r[2]; let r3 = self.r[3]; let r4 = self.r[4]; let s1 = r1 * 5; let s2 = r2 * 5; let s3 = r3 * 5; let s4 = r4 * 5; let mut h0 = self.h[0]; let mut h1 = self.h[1]; let mut h2 = self.h[2]; let mut h3 = self.h[3]; let mut h4 = self.h[4]; // h += m h0 += (u32::from_le_bytes(block[0..4].try_into().unwrap())) & 0x3ff_ffff; h1 += (u32::from_le_bytes(block[3..7].try_into().unwrap()) >> 2) & 0x3ff_ffff; h2 += (u32::from_le_bytes(block[6..10].try_into().unwrap()) >> 4) & 0x3ff_ffff; h3 += (u32::from_le_bytes(block[9..13].try_into().unwrap()) >> 6) & 0x3ff_ffff; h4 += (u32::from_le_bytes(block[12..16].try_into().unwrap()) >> 8) | hibit; // h *= r let d0 = (u64::from(h0) * u64::from(r0)) + (u64::from(h1) * u64::from(s4)) + (u64::from(h2) * u64::from(s3)) + (u64::from(h3) * u64::from(s2)) + (u64::from(h4) * u64::from(s1)); let mut d1 = (u64::from(h0) * u64::from(r1)) + (u64::from(h1) * u64::from(r0)) + (u64::from(h2) * u64::from(s4)) + (u64::from(h3) * u64::from(s3)) + (u64::from(h4) * u64::from(s2)); let mut d2 = (u64::from(h0) * u64::from(r2)) + (u64::from(h1) * u64::from(r1)) + (u64::from(h2) * u64::from(r0)) + (u64::from(h3) * u64::from(s4)) + (u64::from(h4) * u64::from(s3)); let mut d3 = (u64::from(h0) * u64::from(r3)) + (u64::from(h1) * u64::from(r2)) + (u64::from(h2) * u64::from(r1)) + (u64::from(h3) * u64::from(r0)) + (u64::from(h4) * u64::from(s4)); let mut d4 = (u64::from(h0) * u64::from(r4)) + (u64::from(h1) * u64::from(r3)) + (u64::from(h2) * u64::from(r2)) + (u64::from(h3) * u64::from(r1)) + (u64::from(h4) * u64::from(r0)); // (partial) h %= p let mut c: u32; c = (d0 >> 26) as u32; h0 = d0 as u32 & 0x3ff_ffff; d1 += u64::from(c); c = (d1 >> 26) as u32; h1 = d1 as u32 & 0x3ff_ffff; d2 += u64::from(c); c = (d2 >> 26) as u32; h2 = d2 as u32 & 0x3ff_ffff; d3 += u64::from(c); c = (d3 >> 26) as u32; h3 = d3 as u32 & 0x3ff_ffff; d4 += u64::from(c); c = (d4 >> 26) as u32; h4 = d4 as u32 & 0x3ff_ffff; h0 += c * 5; c = h0 >> 26; h0 &= 0x3ff_ffff; h1 += c; self.h[0] = h0; self.h[1] = h1; self.h[2] = h2; self.h[3] = h3; self.h[4] = h4; } 
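    // A note on the `s1`..`s4` terms used in `compute_block` above: the
    // accumulator and `r` are held in radix 2^26, and 2^130 ≡ 5 (mod 2^130 - 5),
    // so any partial product whose weight reaches 2^130 can be folded back in as
    // a multiply-by-5 at the corresponding low weight. For example, h1·r4 has
    // weight 2^(26 + 104) = 2^130 and is therefore accumulated into d0 as
    // h1·(5·r4) = h1·s4.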
/// Finalize output producing a [`Tag`] pub(crate) fn finalize_mut(&mut self) -> Tag { // fully carry h let mut h0 = self.h[0]; let mut h1 = self.h[1]; let mut h2 = self.h[2]; let mut h3 = self.h[3]; let mut h4 = self.h[4]; let mut c: u32; c = h1 >> 26; h1 &= 0x3ff_ffff; h2 += c; c = h2 >> 26; h2 &= 0x3ff_ffff; h3 += c; c = h3 >> 26; h3 &= 0x3ff_ffff; h4 += c; c = h4 >> 26; h4 &= 0x3ff_ffff; h0 += c * 5; c = h0 >> 26; h0 &= 0x3ff_ffff; h1 += c; // compute h + -p let mut g0 = h0.wrapping_add(5); c = g0 >> 26; g0 &= 0x3ff_ffff; let mut g1 = h1.wrapping_add(c); c = g1 >> 26; g1 &= 0x3ff_ffff; let mut g2 = h2.wrapping_add(c); c = g2 >> 26; g2 &= 0x3ff_ffff; let mut g3 = h3.wrapping_add(c); c = g3 >> 26; g3 &= 0x3ff_ffff; let mut g4 = h4.wrapping_add(c).wrapping_sub(1 << 26); // select h if h < p, or h + -p if h >= p let mut mask = (g4 >> (32 - 1)).wrapping_sub(1); g0 &= mask; g1 &= mask; g2 &= mask; g3 &= mask; g4 &= mask; mask = !mask; h0 = (h0 & mask) | g0; h1 = (h1 & mask) | g1; h2 = (h2 & mask) | g2; h3 = (h3 & mask) | g3; h4 = (h4 & mask) | g4; // h = h % (2^128) h0 |= h1 << 26; h1 = (h1 >> 6) | (h2 << 20); h2 = (h2 >> 12) | (h3 << 14); h3 = (h3 >> 18) | (h4 << 8); // h = mac = (h + pad) % (2^128) let mut f: u64; f = u64::from(h0) + u64::from(self.pad[0]); h0 = f as u32; f = u64::from(h1) + u64::from(self.pad[1]) + (f >> 32); h1 = f as u32; f = u64::from(h2) + u64::from(self.pad[2]) + (f >> 32); h2 = f as u32; f = u64::from(h3) + u64::from(self.pad[3]) + (f >> 32); h3 = f as u32; let mut tag = Block::default(); tag[0..4].copy_from_slice(&h0.to_le_bytes()); tag[4..8].copy_from_slice(&h1.to_le_bytes()); tag[8..12].copy_from_slice(&h2.to_le_bytes()); tag[12..16].copy_from_slice(&h3.to_le_bytes()); tag } } #[cfg(feature = "zeroize")] impl Drop for State { fn drop(&mut self) { use zeroize::Zeroize; self.r.zeroize(); self.h.zeroize(); self.pad.zeroize(); } } impl BlockSizeUser for State { type BlockSize = U16; } impl ParBlocksSizeUser for State { type ParBlocksSize = U1; } impl UhfBackend for State { fn proc_block(&mut self, block: &Block) { self.compute_block(block, false); } } impl UniversalHash for State { fn update_with_backend( &mut self, f: impl universal_hash::UhfClosure, ) { f.call(self); } /// Finalize output producing a [`Tag`] fn finalize(mut self) -> Tag { self.finalize_mut() } } poly1305-0.8.0/src/backend.rs000064400000000000000000000004420072674642500137010ustar 00000000000000//! 
Poly1305 backends

#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64"),
    not(poly1305_force_soft)
))]
pub(crate) mod avx2;

#[cfg(all(
    any(target_arch = "x86", target_arch = "x86_64"),
    not(poly1305_force_soft)
))]
pub(crate) mod autodetect;

pub(crate) mod soft;

poly1305-0.8.0/src/fuzz/id=000000,sig=06,src=000014,op=flip4,pos=11 (binary fuzzer crash input; contents omitted)
poly1305-0.8.0/src/fuzz/id=000001,sig=06,src=000006+000014,op=splice,rep=64 (binary fuzzer crash input; contents omitted)
poly1305-0.8.0/src/fuzz/id=000002,sig=06,src=000008+000014,op=splice,rep=32 (binary fuzzer crash input; contents omitted)
poly1305-0.8.0/src/fuzz/id=000003,sig=06,src=000003,op=havoc,rep=64 (binary fuzzer crash input; contents omitted)
poly1305-0.8.0/src/fuzz/id=000004,sig=06,src=000022+000005,op=splice,rep=32 (binary fuzzer crash input; contents omitted)
poly1305-0.8.0/src/fuzz/id=000005,sig=06,src=000008+000007,op=splice,rep=128 (binary fuzzer crash input; contents omitted)
poly1305-0.8.0/src/fuzz/id=000006,sig=06,src=000005,op=havoc,rep=8 (binary fuzzer crash input; contents omitted)
poly1305-0.8.0/src/fuzz/id=000007,sig=06,src=000024+000000,op=splice,rep=64 (binary fuzzer crash input; contents omitted)
poly1305-0.8.0/src/fuzz/id=000008,sig=06,src=000019,time=165655+000011,op=splice,rep=128 (binary fuzzer crash input; contents omitted)
poly1305-0.8.0/src/fuzz.rs

use universal_hash::{generic_array::GenericArray, UniversalHash};

use crate::{backend, Block, Key, BLOCK_SIZE};

/// Helper function for fuzzing the AVX2 backend.
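///
/// A minimal sketch of how a fuzz target might drive it (mirroring the
/// `avx2_fuzzer_test_case` helper below; splitting the input at 32 bytes into
/// key material and message is that helper's convention, not a requirement):
///
/// ```ignore
/// if data.len() >= 32 {
///     poly1305::fuzz_avx2(data[0..32].into(), &data[32..]);
/// }
/// ```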
pub fn fuzz_avx2(key: &Key, data: &[u8]) { let mut avx2 = backend::avx2::State::new(key); let mut soft = backend::soft::State::new(key); for (_i, chunk) in data.chunks(BLOCK_SIZE).enumerate() { if chunk.len() == BLOCK_SIZE { let block = GenericArray::from_slice(chunk); unsafe { avx2.compute_block(block, false); } soft.compute_block(block, false); } else { let mut block = Block::default(); block[..chunk.len()].copy_from_slice(chunk); block[chunk.len()] = 1; unsafe { avx2.compute_block(&block, true); } soft.compute_block(&block, true); } // Check that the same tag would be derived after each chunk. // We add the chunk number to the assertion for debugging. // When fuzzing, we skip this check, and just look at the end. #[cfg(test)] assert_eq!( (_i + 1, unsafe { avx2.clone().finalize() }), (_i + 1, soft.clone().finalize()), ); } assert_eq!(unsafe { avx2.finalize() }, soft.finalize()); } fn avx2_fuzzer_test_case(data: &[u8]) { fuzz_avx2(data[0..32].into(), &data[32..]); } #[test] fn crash_0() { avx2_fuzzer_test_case(include_bytes!( "fuzz/id=000000,sig=06,src=000014,op=flip4,pos=11" )); } #[test] fn crash_1() { avx2_fuzzer_test_case(include_bytes!( "fuzz/id=000001,sig=06,src=000006+000014,op=splice,rep=64" )); } #[test] fn crash_2() { avx2_fuzzer_test_case(include_bytes!( "fuzz/id=000002,sig=06,src=000008+000014,op=splice,rep=32" )); } #[test] fn crash_3() { // This input corresponds to a key of: // r = 0x0f245bfc0f7fe5fc0fffff3400fb1c2b // s = 0xffffff000001000040f6fff5ffffffff // // and input blocks: // [0x01ea0010000a00ff108b72ffffffffffff, 0x01ffffffff245b74ff7fe5ffffff0040ff, // 0x01000a00ff108b7200ff04000002ffffff, 0x01ffffffffffffffffffff0000ffea0010, // 0x0180ffffffffffffffffffffffe3ffffff, 0x01ffffffffffffffffffffffffffffffff, // 0x01ffffffffffffffffffdfffff03ffffff, 0x01ffffffffff245b74ff7fe5ffffe4ffff, // 0x0112118b7d00ffeaffffffffffffffffff, 0x010e40eb10ffffffff1edd7f0010000a00] // // When this crash occurred, the software and AVX2 backends would generate the same // tags given the first seven blocks as input. Given the first eight blocks, the // following tags were generated: // // | tag | low 128 bits of final accumulator // soft | 0x0004d01b9168ded528a9b541cc461988 - s = 0x0004d11b9167ded4e7b2b54bcc461989 // avx2 | 0x0004d01b9168ded528a9b540cc461988 - s = 0x0004d11b9167ded4e7b2b54acc461989 // difference = 0x0100000000 // // This discrepancy was due to Unreduced130::reduce (as called during finalization) // not correctly reducing. During the reduction step, the upper limb's upper bits // (beyond 2^130) are added into the lower limb multiplied by 5 (for reduction modulo // 2^130 - 5). This is computed like so: // // b = t_4 >> 26 // t_0 += b + (b << 2) // // It is possible for the upper limb to be 57+ bits; thus b << 2 can be 33+ bits. // However, the original reduction code was using _mm256_slli_epi32, which shifts // packed 32-bit integers; this was causing the upper bits of b to be lost. Switching // to _mm256_slli_epi64 (correctly treating b as a 64-bit field) solves the problem. 
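    // A concrete illustration of the epi32 vs epi64 difference (numbers chosen
    // for illustration, not taken from the crash input): if b = 2^31, then
    // b << 2 should be 2^33. _mm256_slli_epi32 shifts each 32-bit lane
    // independently, so the bits shifted out of the low half of the 64-bit
    // field are dropped and the computed value is 0; _mm256_slli_epi64 shifts
    // the whole 64-bit field and yields the expected 2^33.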
avx2_fuzzer_test_case(include_bytes!( "fuzz/id=000003,sig=06,src=000003,op=havoc,rep=64" )); } #[test] fn crash_4() { avx2_fuzzer_test_case(include_bytes!( "fuzz/id=000004,sig=06,src=000022+000005,op=splice,rep=32" )); } #[test] fn crash_5() { avx2_fuzzer_test_case(include_bytes!( "fuzz/id=000005,sig=06,src=000008+000007,op=splice,rep=128" )); } #[test] fn crash_6() { // This input corresponds to a key of: // r = 0x04040404040404040404040404040404 // s = 0x0404040403ef04040404040404040404 // // and input: // [0x04, 0x04, 0x04, 0xf2] // // The input fits into a single short block: // m = 0x01f2040404 // // and we should have the following computation: // tag = ((m * r) % p) + s // = ((0x01f2040404 * 0x04040404040404040404040404040404) % p) + s // = (0x7cfdfeffffffffffffffffffffffffff8302010 % ((1 << 130) - 5)) + s // = 0x1f3f7fc + 0x0404040403ef04040404040404040404 // = 0x0404040403ef04040404040405f7fc00 // // or in bytes: // tag = [ // 0x00, 0xfc, 0xf7, 0x05, 0x04, 0x04, 0x04, 0x04, // 0x04, 0x04, 0xef, 0x03, 0x04, 0x04, 0x04, 0x04, // ]; // // The crash was caused by the final modular reduction (in the `addkey` method of the // Goll-Gueron implementation, and `impl Add for AdditionKey` here) not // fully carrying all bits. `Aligned130` is guaranteed to be a 130-bit integer, but is // not guaranteed to be an integer modulo 2^130 - 5. avx2_fuzzer_test_case(include_bytes!( "fuzz/id=000006,sig=06,src=000005,op=havoc,rep=8" )); } #[test] fn crash_7() { avx2_fuzzer_test_case(include_bytes!( "fuzz/id=000007,sig=06,src=000024+000000,op=splice,rep=64" )); } #[test] fn crash_8() { // This input corresponds to a key of: // r = 0x0fff00fc0000000000000000006f91ab // s = 0xffffffffffffffffffffffffffffffff // // and a single input block: // 0x01d4d4ffffffffffffffffffffffffffff // // We should have the following computation: // tag = ((m * r) % p) + s // = ((0x01d4d4ffffffffffffffffffffffffffff * 0x0fff00fc0000000000000000006f91ab) % p) + s // = (0x1d4b7cf881ac00000000000000cc5320bf47ff03ffffffffffffffffff906e55 % ((1 << 130) - 5)) + s // = 0xe3e65b3aa217000000000000008fd63d + 0xffffffffffffffffffffffffffffffff // = 0x01e3e65b3aa217000000000000008fd63c mod 128 // // or in bytes: // tag = [ // 0x3c, 0xd6, 0x8f, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x00, 0x00, 0x17, 0xa2, 0x3a, 0x5b, 0xe6, 0xe3, // ]; // // The crash was caused by the final modular reduction (in the `addkey` method of the // Goll-Gueron implementation, and `impl Add for AdditionKey` here). After // adding s, limbs 0 and 2 have carries, while limb 1 is 0xffffffff. The original // implementation only carried once, after which limb 1 has a carry, which was then // discarded. The fix was to always carry three times, to ensure that all potential // carry bits are carried. avx2_fuzzer_test_case(include_bytes!( "fuzz/id=000008,sig=06,src=000019,time=165655+000011,op=splice,rep=128" )); } poly1305-0.8.0/src/lib.rs000064400000000000000000000117120072674642500130620ustar 00000000000000//! The Poly1305 universal hash function and message authentication code. //! //! # About //! //! Poly1305 is a universal hash function suitable for use as a one-time //! authenticator and, when combined with a cipher, a message authentication //! code (MAC). //! //! It takes a 32-byte one-time key and a message and produces a 16-byte tag, //! which can be used to authenticate the message. //! //! Poly1305 is primarily notable for its use in the [`ChaCha20Poly1305`] and //! [`XSalsa20Poly1305`] authenticated encryption algorithms. //! //! 
# Minimum Supported Rust Version //! //! Rust **1.56** or higher. //! //! Minimum supported Rust version may be changed in the future, but such //! changes will be accompanied with a minor version bump. //! //! # Security Notes //! //! This crate has received one [security audit by NCC Group][audit], with no //! significant findings. We would like to thank [MobileCoin] for funding the //! audit. //! //! NOTE: the audit predates the AVX2 backend, which has not yet been audited. //! //! All implementations contained in the crate are designed to execute in constant //! time, either by relying on hardware intrinsics (e.g. AVX2 on x86/x86_64), or //! using a portable implementation which is only constant time on processors which //! implement constant-time multiplication. //! //! It is not suitable for use on processors with a variable-time multiplication //! operation (e.g. short circuit on multiply-by-zero / multiply-by-one, such as //! certain 32-bit PowerPC CPUs and some non-ARM microcontrollers). //! //! [`ChaCha20Poly1305`]: https://docs.rs/chacha20poly1305 //! [`XSalsa20Poly1305`]: https://docs.rs/xsalsa20poly1305 //! [audit]: https://research.nccgroup.com/2020/02/26/public-report-rustcrypto-aes-gcm-and-chacha20poly1305-implementation-review/ //! [MobileCoin]: https://mobilecoin.com #![no_std] #![doc( html_logo_url = "https://raw.githubusercontent.com/RustCrypto/media/8f1a9894/logo.svg", html_favicon_url = "https://raw.githubusercontent.com/RustCrypto/media/8f1a9894/logo.svg" )] #![warn(missing_docs, rust_2018_idioms)] #[cfg(feature = "std")] extern crate std; pub use universal_hash; use universal_hash::{ consts::{U16, U32}, crypto_common::{BlockSizeUser, KeySizeUser}, generic_array::GenericArray, KeyInit, UniversalHash, }; mod backend; #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), not(poly1305_force_soft), target_feature = "avx2", // Fuzz tests bypass AVX2 autodetection code any(fuzzing, test) ))] mod fuzz; #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), not(poly1305_force_soft) ))] use crate::backend::autodetect::State; #[cfg(not(all( any(target_arch = "x86", target_arch = "x86_64"), not(poly1305_force_soft) )))] use crate::backend::soft::State; /// Size of a Poly1305 key pub const KEY_SIZE: usize = 32; /// Size of the blocks Poly1305 acts upon pub const BLOCK_SIZE: usize = 16; /// Poly1305 keys (32-bytes) pub type Key = universal_hash::Key; /// Poly1305 blocks (16-bytes) pub type Block = universal_hash::Block; /// Poly1305 tags (16-bytes) pub type Tag = universal_hash::Block; /// The Poly1305 universal hash function. /// /// Note that Poly1305 is not a traditional MAC and is single-use only /// (a.k.a. "one-time authenticator"). /// /// For this reason it doesn't impl the `crypto_mac::Mac` trait. #[derive(Clone)] pub struct Poly1305 { state: State, } impl KeySizeUser for Poly1305 { type KeySize = U32; } impl KeyInit for Poly1305 { /// Initialize Poly1305 with the given key fn new(key: &Key) -> Poly1305 { Poly1305 { state: State::new(key), } } } impl BlockSizeUser for Poly1305 { type BlockSize = U16; } impl UniversalHash for Poly1305 { fn update_with_backend( &mut self, f: impl universal_hash::UhfClosure, ) { self.state.update_with_backend(f); } /// Get the hashed output fn finalize(self) -> Tag { self.state.finalize() } } impl Poly1305 { /// Compute unpadded Poly1305 for the given input data. /// /// The main use case for this is XSalsa20Poly1305. 
pub fn compute_unpadded(mut self, data: &[u8]) -> Tag { for chunk in data.chunks(BLOCK_SIZE) { if chunk.len() == BLOCK_SIZE { let block = GenericArray::from_slice(chunk); self.state.compute_block(block, false); } else { let mut block = Block::default(); block[..chunk.len()].copy_from_slice(chunk); block[chunk.len()] = 1; self.state.compute_block(&block, true) } } self.state.finalize() } } opaque_debug::implement!(Poly1305); #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), not(poly1305_force_soft), target_feature = "avx2", // Fuzz tests bypass AVX2 autodetection code any(fuzzing, test) ))] pub use crate::fuzz::fuzz_avx2; poly1305-0.8.0/tests/lib.rs000064400000000000000000000072060072674642500134400ustar 00000000000000use hex_literal::hex; use poly1305::{ universal_hash::{KeyInit, UniversalHash}, Block, Poly1305, BLOCK_SIZE, KEY_SIZE, }; use std::iter::repeat; #[test] fn test_nacl_vector() { let key = hex!("eea6a7251c1e72916d11c2cb214d3c252539121d8e234e652d651fa4c8cff880"); let msg = hex!( "8e993b9f48681273c29650ba32fc76ce 48332ea7164d96a4476fb8c531a1186a c0dfc17c98dce87b4da7f011ec48c972 71d2c20f9b928fe2270d6fb863d51738 b48eeee314a7cc8ab932164548e526ae 90224368517acfeabd6bb3732bc0e9da 99832b61ca01b6de56244a9e88d5f9b3 7973f622a43d14a6599b1f654cb45a74 e355a5" ); let expected = hex!("f3ffc7703f9400e52a7dfb4b3d3305d9"); let result1 = Poly1305::new(key.as_ref().into()).compute_unpadded(&msg); assert_eq!(&expected[..], result1.as_slice()); } #[test] fn donna_self_test1() { // This gives r = 2 and s = 0. let key = hex!("0200000000000000000000000000000000000000000000000000000000000000"); // This results in a 130-bit integer with the lower 129 bits all set: m = (1 << 129) - 1 let msg = hex!("ffffffffffffffffffffffffffffffff"); // The input is a single block, so we should have the following computation: // tag = ((m * r) % p) + s // = ((((1 << 129) - 1) * 2) % p) + 0 // = ((1 << 130) - 2) % (1 << 130) - 5 // = 3 let expected = hex!("03000000000000000000000000000000"); let mut poly = Poly1305::new(key.as_ref().into()); poly.update(&[Block::clone_from_slice(msg.as_ref())]); assert_eq!(&expected[..], poly.finalize().as_slice()); } #[test] fn donna_self_test2() { let total_key = hex!("01020304050607fffefdfcfbfaf9ffffffffffffffffffffffffffff00000000"); let total_mac = hex!("64afe2e8d6ad7bbdd287f97c44623d39"); let mut tpoly = Poly1305::new(total_key.as_ref().into()); for i in 0..256 { let mut key = [0u8; KEY_SIZE]; key.copy_from_slice(&repeat(i as u8).take(KEY_SIZE).collect::>()); let msg: Vec = repeat(i as u8).take(256).collect(); let tag = Poly1305::new(key.as_ref().into()).compute_unpadded(&msg[..i]); tpoly.update(&[tag.into()]); } assert_eq!(&total_mac[..], tpoly.finalize().as_slice()); } #[test] fn test_tls_vectors() { // from http://tools.ietf.org/html/draft-agl-tls-chacha20poly1305-04 let key = b"this is 32-byte key for Poly1305"; let msg = [0u8; 32]; let expected = hex!("49ec78090e481ec6c26b33b91ccc0307"); let mut poly = Poly1305::new(key.as_ref().into()); let blocks = msg .chunks(BLOCK_SIZE) .map(|chunk| Block::clone_from_slice(chunk)) .collect::>(); poly.update(&blocks); assert_eq!(&expected[..], poly.finalize().as_slice()); } #[test] fn test_rfc7539_vector() { // From let key = hex!("85d6be7857556d337f4452fe42d506a80103808afb0db2fd4abff6af4149f51b"); let msg = hex!("43727970746f6772617068696320466f72756d2052657365617263682047726f7570"); let expected = hex!("a8061dc1305136c6c22b8baf0c0127a9"); let result = Poly1305::new(key.as_ref().into()).compute_unpadded(&msg); 
    assert_eq!(&expected[..], result.as_slice());
}

#[test]
fn padded_input() {
    // poly1305 key and AAD from the ChaCha20-Poly1305 AEAD example in RFC 8439, Section 2.8.2
    let key = hex!("7bac2b252db447af09b67a55a4e955840ae1d6731075d9eb2a9375783ed553ff");
    let msg = hex!("50515253c0c1c2c3c4c5c6c7");
    let expected = hex!("ada56caa480fe6f5067039244a3d76ba");

    let mut poly = Poly1305::new(key.as_ref().into());
    poly.update_padded(&msg);

    assert_eq!(&expected[..], poly.finalize().as_slice());
}
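
#[test]
fn streaming_matches_unpadded_for_full_blocks() {
    // Illustrative consistency check (not an upstream test vector; key and
    // message are arbitrary): for messages that are an exact multiple of
    // BLOCK_SIZE, the one-shot `compute_unpadded` path and the streaming
    // `update` path process identical full blocks, so their tags must agree.
    let key = [0x42u8; KEY_SIZE];
    let msg = [0x07u8; BLOCK_SIZE * 4];

    let one_shot = Poly1305::new(key.as_ref().into()).compute_unpadded(&msg);

    let mut poly = Poly1305::new(key.as_ref().into());
    let blocks = msg
        .chunks(BLOCK_SIZE)
        .map(|chunk| Block::clone_from_slice(chunk))
        .collect::<Vec<_>>();
    poly.update(&blocks);

    assert_eq!(one_shot, poly.finalize());
}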